def organize_hits_paf(PAF_path):
    '''
    Group hits by query name into a dictionary

    Input:
        1. PAF_path: Path to PAF file containing alignments

    Output:
        1. hits: dictionary containing query names as keys and a PAF object with the alignments for each query as values
    '''
    ## 1. Read PAF
    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 2. Organize hits by query name into the dictionary
    hits = {}

    # For each read alignment
    for alignment in PAF.alignments:

        # First hit for this query, initialize a PAF object for it
        if alignment.qName not in hits:
            queryPAF = formats.PAF()
            hits[alignment.qName] = queryPAF

        # Add hit to the query's PAF
        hits[alignment.qName].alignments.append(alignment)

    return hits
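# Usage sketch (illustrative only): assumes a PAF file at a hypothetical path;
# prints the number of hits collected for each query name.
def _example_organize_hits_paf(PAF_path='alignments.paf'):
    hits = organize_hits_paf(PAF_path)
    for qName, queryPAF in hits.items():
        print(qName, len(queryPAF.alignments))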
def alignments2PAF(alignments):
    '''
    Convert a set of pysam aligned segments into a PAF object

    Input:
        1. alignments: list of pysam aligned segments

    Output:
        1. PAF: PAF object containing the alignments
    '''
    ## 1. Initialize PAF object
    PAF = formats.PAF()

    ## 2. Convert each aligned segment into a PAF_alignment object and add it to the PAF
    for alignment in alignments:

        # Discard unmapped sequences
        if not alignment.is_unmapped:
            strand = '-' if alignment.is_reverse else '+'

            fields = [alignment.query_name, alignment.infer_read_length(), alignment.query_alignment_start, alignment.query_alignment_end, strand, alignment.reference_name, alignment.reference_length, alignment.reference_start, alignment.reference_end, 0, 0, alignment.mapping_quality]

            line = formats.PAF_alignment(fields)
            PAF.alignments.append(line)

    return PAF
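# Usage sketch (illustrative only): assumes pysam is available and a hypothetical
# coordinate-sorted, indexed BAM file; converts the mapped reads overlapping a region
# into a PAF object via alignments2PAF.
def _example_alignments2PAF(BAM_path='reads.bam', ref='chr1', beg=1000, end=2000):
    import pysam
    bam = pysam.AlignmentFile(BAM_path, 'rb')
    alignments = list(bam.fetch(ref, beg, end))
    return alignments2PAF(alignments)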
def aligmentMaxNbMatches(FASTA_file, db, PAF_file, outDir):
    '''
    Align the sequences in a FASTA file against a database with minimap2 and pick the alignment with the highest number of matches

    Input:
        1. FASTA_file: Path to FASTA file containing the sequences to align
        2. db: Path to the minimap2 database/index to align against
        3. PAF_file: Path to the output PAF file
        4. outDir: Output directory

    Output:
        1. aligmentMaxNbMatches: alignment with the highest number of matches, or None if there are no hits
    '''
    # TODO: UNSILENCE!!!
    # TODO: append to the error file!
    err = open(outDir + '/identifyMate.err', 'w')
    command = 'minimap2 ' + db + ' ' + FASTA_file + ' > ' + PAF_file
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'IDENTIFY MATE SEQ'
        msg = 'Identify mate sequence failed'
        log.step(step, msg)

    # If PAF file is not empty
    if not os.stat(PAF_file).st_size == 0:
        PAFObj = formats.PAF()
        PAFObj.read(PAF_file)

        # Pick the alignment with the highest number of matches
        aligmentMaxNbMatches = PAFObj.sortNbMatches()[0]

    else:
        aligmentMaxNbMatches = None

    return aligmentMaxNbMatches
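# Usage sketch (illustrative only): assumes minimap2 is on the PATH, that outDir
# exists, and hypothetical input paths; prints the query and target of the best hit
# (most matches), if any.
def _example_aligmentMaxNbMatches(outDir='out'):
    PAF_file = outDir + '/mate.paf'
    best = aligmentMaxNbMatches('mate.fa', 'consensus_ME.mmi', PAF_file, outDir)
    if best is not None:
        print(best.qName, best.tName)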
def search4partnered_5prime(structures, fasta, reference, outDir):
    '''
    Search for 5' partnered transductions by realigning the unresolved 5' end of each inserted sequence on the reference genome

    Input:
        1. structures: dictionary containing insertion structure information per insertion id
        2. fasta: FASTA object containing the inserted sequences
        3. reference: Path to the reference genome
        4. outDir: Output directory

    Output:
        1. structures: input dictionary updated with 5' partnered transduction calls
    '''
    ## 1. Create FASTA with sequences to realign
    seq2realign = formats.FASTA()

    for insId in structures:

        # Discard if strand not determined
        if structures[insId]['STRAND'] is None:
            continue

        ## Extract unresolved 5' sequence if any
        qBeg, qEnd = structures[insId]['CHAIN'].interval()

        if structures[insId]['STRAND'] == '+':
            seq2realign.seqDict[insId] = fasta.seqDict[insId][:qBeg]
        else:
            seq2realign.seqDict[insId] = fasta.seqDict[insId][qEnd:]

    fastaPath = outDir + '/seq2realign.5prime.fasta'
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome.5prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.5prime', outDir)

    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 3. Make 5' transduction calls
    # For each hit
    for hit in PAF.alignments:

        hit.tName = 'chr' + hit.tName
        iRef, coord = hit.qName.split(':')
        iBeg = int(coord) - 500
        iEnd = int(coord) + 500

        ## Filter out hits
        if (hit.alignmentPerc() < 75) or (hit.MAPQ < 30) or (iRef == hit.tName and gRanges.overlap(iBeg, iEnd, hit.tBeg, hit.tEnd)[0]):
            continue

        ## Make call
        structures[hit.qName]['ITYPE'] = 'partnered'
        structures[hit.qName]['5PRIME'] = True
        structures[hit.qName]['TDCOORD_5PRIME'] = hit.tName + ':' + str(hit.tBeg) + '-' + str(hit.tEnd)
        structures[hit.qName]['TDLEN_5PRIME'] = hit.tEnd - hit.tBeg

    return structures
def group_alignments(paf):
    '''
    Group PAF alignments by query name into single PAF objects

    Input:
        1. paf: PAF object containing alignments

    Output:
        1. pafDict: dictionary containing query names as keys and a PAF object with the alignments for each query as values
    '''
    pafDict = {}

    ## For each hit
    for hit in paf.alignments:

        # Initialize PAF object for this inserted sequence
        if hit.qName not in pafDict:
            pafDict[hit.qName] = formats.PAF()

        # Add hit to the corresponding PAF
        pafDict[hit.qName].alignments.append(hit)

    return pafDict
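# Usage sketch (illustrative only): assumes a PAF file at a hypothetical path; groups
# its hits per query and reports how much of each query is covered by chained hits,
# mirroring the chaining parameters used in call_NUMT below.
def _example_group_alignments(PAF_path='hits.paf'):
    paf = formats.PAF()
    paf.read(PAF_path)
    pafDict = group_alignments(paf)
    for qName in pafDict:
        chain = pafDict[qName].chain(20, 50)
        print(qName, chain.perc_query_covered())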
def retrotransposon_structure(FASTA_file, index, outDir):
    '''
    Infer the insertion size, structure, poly-A, target site duplication length and other insertion structural features

    Input:
        1. FASTA_file: Path to FASTA file containing the sequence
        2. index: Minimap2 index for consensus retrotransposon sequences database
        3. outDir: Output directory

    Output:
        1. structure: dictionary containing insertion structure information
    '''
    structure = {}

    ## 0. Create logs directory ##
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Align the sequence against the retrotransposon sequences database ##
    PAF_file = alignment.alignment_minimap2(FASTA_file, index, 'alignment2consensus', 1, outDir)

    ## 2. Read PAF alignments ##
    PAF = formats.PAF()
    PAF.read(PAF_file)

    # Exit function if no hit on the retrotransposons database
    if not PAF.alignments:
        return structure

    ## 3. Chain complementary alignments ##
    chain = PAF.chain(100, 20)

    ## 4. Infer insertion features ##
    ## Retrieve inserted seq
    FASTA = formats.FASTA()
    FASTA.read(FASTA_file)
    sequence = list(FASTA.seqDict.values())[0]

    ## 4.1 Insertion type
    structure['INS_TYPE'], structure['FAMILY'], structure['CYTOBAND'] = insertion_type(chain)

    ## 4.2 Insertion strand
    structure['STRAND'], structure['POLYA'] = infer_strand(structure['INS_TYPE'], sequence, chain)

    ## 4.3 Sequence lengths
    lengths = infer_lengths(structure['INS_TYPE'], chain, structure['STRAND'])
    structure.update(lengths)

    ## 4.4 Insertion mechanism (TPRT or EI)
    structure['MECHANISM'] = infer_integration_mechanism(chain, structure['TRUNCATION_3_LEN'], structure['POLYA'])

    ## 4.5 Target site duplication (TO DO LATER...)
    #search4tsd()

    ## 4.6 Percentage resolved
    structure['PERC_RESOLVED'] = chain.perc_query_covered()

    return structure
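# Usage sketch (illustrative only): assumes minimap2 is on the PATH and hypothetical
# paths for a single-sequence FASTA and a consensus retrotransposon FASTA; builds the
# minimap2 index and reports the inferred insertion structure features.
def _example_retrotransposon_structure(FASTA_file='insertion.fa', consensus='consensus_ME.fa', outDir='out'):
    index = alignment.index_minimap2(consensus, 'consensus', outDir)
    structure = retrotransposon_structure(FASTA_file, index, outDir)
    for feature, value in structure.items():
        print(feature, value)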
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call nuclear integrations of mitochondrial DNA (NUMTs) from inserted sequences

    Input:
        1. vcf: VCF object containing INS variants with inserted sequences
        2. mtGenome: Path to the mitochondrial genome FASTA file
        3. outDir: Output directory

    Output:
        1. outVCF: VCF object containing NUMT calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1, tmpDir)

    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMT calls
    NUMTs = {}

    for insId in PAFs_mt:

        chain = PAFs_mt[insId].chain(20, 50)

        # Make NUMT call if enough % of sequence resolved
        if chain.perc_query_covered() >= 60:
            coords = chain.interval_template()

            NUMT = {}
            NUMT['ITYPE'] = 'NUMT'
            NUMT['MT_COORD'] = str(coords[0]) + '-' + str(coords[1])
            NUMTs[insId] = NUMT

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered or orphan)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)']}

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to NUMT calls and update their info field with NUMT features
    for variant in vcf.variants:

        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if insId not in NUMTs:
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
def resolve_partnered_3prime(structures, fasta, reference, sourceDb, outDir):
    '''
    Resolve 3' partnered transductions by realigning candidate transduced sequences on the reference genome

    Input:
        1. structures: dictionary containing insertion structure information per insertion id
        2. fasta: FASTA object containing the inserted sequences
        3. reference: Path to the reference genome
        4. sourceDb: database of known source elements organized per reference chromosome
        5. outDir: Output directory

    Output:
        1. structures: input dictionary updated with 3' partnered transduction calls
    '''
    ## 1. Create FASTA with sequences to realign
    seq2realign = formats.FASTA()
    pattern = re.compile("Partnered_[0-9]+")
    partneredDict = {}

    for insId in structures:

        # Discard solo
        if structures[insId]['ITYPE'] != 'partnered':
            continue

        ## Initialize partnered dict for the ins
        partneredDict[insId] = {}
        partneredDict[insId]['NB_PARTNERED'] = 0
        partneredDict[insId]['NB_RESOLVED'] = 0

        # For each hit
        for hit in structures[insId]['CHAIN'].alignments:

            # Discard if not partnered
            if not pattern.match(hit.tName):
                continue

            # Add candidate partnered sequence to the fasta
            seqId = insId + '|' + hit.tName
            seq = fasta.seqDict[insId][hit.qBeg:hit.qEnd]
            seq2realign.seqDict[seqId] = seq

            ## Update partnered dictionary
            partneredDict[insId]['NB_PARTNERED'] += 1
            partneredDict[insId][hit.tName] = hit

    fastaPath = outDir + '/seq2realign.3prime.fasta'
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome.3prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.3prime', outDir)

    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 3. Add hit information to partnered transduction candidates
    hits = PAF.hits2dict()

    # For each partnered event
    for ID in hits:

        insId, tdId = ID.split('|')
        partneredDict[insId]['CYTOID'] = None

        # For each hit
        for hit in hits[ID]:

            hit.tName = 'chr' + hit.tName

            ## Check if it's a partnered transduction from a known source element
            if (hit.tName in sourceDb) and sourceDb[hit.tName].collect_interval(hit.tBeg - 200, hit.tEnd + 200, 'ALL'):
                source = sourceDb[hit.tName].collect_interval(hit.tBeg - 200, hit.tEnd + 200, 'ALL')[0][0]
                partneredDict[insId]['CYTOID'] = source.optional['cytobandId']

            ## Filter out hits
            if (hit.alignmentPerc() < 75 or hit.MAPQ < 30) and (partneredDict[insId]['CYTOID'] is None):
                continue

            ## Add hit information
            partneredDict[insId]['NB_RESOLVED'] += 1
            partneredDict[insId][tdId].tName = hit.tName + ':' + str(hit.tBeg) + '-' + str(hit.tEnd)

    ## 4. Add transduction information
    for insId in partneredDict:

        # a) Make transduction call
        if (partneredDict[insId]['NB_PARTNERED'] > 0) and (partneredDict[insId]['NB_PARTNERED'] == partneredDict[insId]['NB_RESOLVED']):
            tdIds = [key for key in partneredDict[insId].keys() if key not in ['NB_PARTNERED', 'NB_RESOLVED', 'CYTOID']]

            structures[insId]['CYTOID'] = partneredDict[insId]['CYTOID']
            structures[insId]['TDCOORD_3PRIME'] = ','.join([partneredDict[insId][tdId].tName for tdId in tdIds])
            structures[insId]['TDLEN_3PRIME'] = ','.join([str(partneredDict[insId][tdId].qEnd - partneredDict[insId][tdId].qBeg) for tdId in tdIds])

        # b) Make solo call
        else:
            structures[insId]['ITYPE'] = 'solo'

    return structures
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call mobile element insertions (MEIs) from inserted sequences

    Input:
        1. vcf: VCF object containing INS variants with inserted sequences
        2. consensus: Path to FASTA file containing consensus retrotransposon sequences
        3. reference: Path to the reference genome
        4. sourceDb: database of known source elements organized per reference chromosome
        5. outDir: Output directory

    Output:
        1. outVCF: VCF object containing MEI calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex, 'hits2consensus', 1, tmpDir)

    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary: hardcoded small RNAs index
    index = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"
    PAF_path = alignment.alignment_minimap2(fastaPath, index, 'hits2small_MEI', 1, tmpDir)

    ## Align inserted sequences against the reference genome
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId], fasta.seqDict[insId])
        seqBeg, seqEnd = structures[insId]['CHAIN'].interval()

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference, sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta)
    # TO FINISH LATER (Only two L1 orphan transductions so far..)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']}

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update their info field with MEI features
    for variant in vcf.variants:

        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in structures) or (structures[insId]['PASS'] is False):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
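# Usage sketch (illustrative only): 'vcf' is a formats.VCF object holding INS calls
# (loaded upstream), 'sourceDb' is the source element database used by
# resolve_partnered_3prime; the remaining paths are hypothetical placeholders.
# Shows how the two callers above can produce MEI- and NUMT-annotated output VCFs.
def _example_call_MEI_and_NUMT(vcf, sourceDb, consensus='consensus_ME.fa',
                               reference='genome.fa', mtGenome='chrM.fa', outDir='out'):
    meiVCF = call_MEI(vcf, consensus, reference, sourceDb, outDir)
    numtVCF = call_NUMT(vcf, mtGenome, outDir)
    return meiVCF, numtVCF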