Example #1
def organize_hits_paf(PAF_path):
    '''
    Group hits by query name into a dictionary

    Input:
        1. PAF_path: Path to PAF file containing alignments

    Output:
        1. hits: dictionary with query names as keys and a PAF object containing that query's alignments as values
    '''
    ## 1. Read PAF
    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 2. Organize hits by query name into the dictionary
    hits = {}

    # For each read alignment
    for alignment in PAF.alignments:

        # First hit for this query, initialize a new PAF object
        if alignment.qName not in hits:
            queryPAF = formats.PAF()
            hits[alignment.qName] = queryPAF

        # Add hit to PAF
        hits[alignment.qName].alignments.append(alignment)

    return hits
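A minimal usage sketch of the function above; the input path is hypothetical and only the PAF attributes already shown in this example are assumed.

hits = organize_hits_paf('alignments.paf')  # hypothetical input PAF

for qName, queryPAF in hits.items():
    # Each value is a formats.PAF object holding that query's alignments
    print(qName, len(queryPAF.alignments))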
Example #2
def alignments2PAF(alignments):
    '''
    Convert a set of pysam aligned segments into a PAF object

    Input:
        1. alignments: list of pysam aligned segments

    Output:
        1. PAF: PAF object containing the alignments
    '''

    ## 1. Initialize PAF object
    PAF = formats.PAF()

    ## 2. Convert each aligned segment into a PAF_alignment object and add to PAF
    for alignment in alignments:

        # Discard unmapped sequences
        if not alignment.is_unmapped:

            strand = '-' if alignment.is_reverse else '+'
            fields = [
                alignment.query_name,
                alignment.infer_read_length(), alignment.query_alignment_start,
                alignment.query_alignment_end, strand,
                alignment.reference_name, alignment.reference_length,
                alignment.reference_start, alignment.reference_end, 0, 0,
                alignment.mapping_quality
            ]
            line = formats.PAF_alignment(fields)
            PAF.alignments.append(line)

    return PAF
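A minimal usage sketch, assuming an indexed BAM is available; the file name and region are hypothetical, while pysam.AlignmentFile and fetch are standard pysam calls.

import pysam

bam = pysam.AlignmentFile('reads.bam', 'rb')        # hypothetical BAM file
segments = bam.fetch('chr1', 100000, 101000)        # hypothetical region
PAF = alignments2PAF(segments)
print(len(PAF.alignments), 'mapped segments converted to PAF')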
Example #3
def aligmentMaxNbMatches(FASTA_file, db, PAF_file, outDir):
    '''
    Align a FASTA file against a minimap2 database and return the alignment with the highest number of matches

    Input:
        1. FASTA_file: Path to FASTA file containing the sequence to align
        2. db: Path to minimap2 database of sequences
        3. PAF_file: Path for the output PAF file
        4. outDir: Output directory

    Output:
        1. aligmentMaxNbMatches: PAF alignment with the highest number of matches, or None if there are no alignments
    '''

    # UNSILENCE THIS!!!

    # TODO: open the error file in append mode!
    err = open(outDir + '/identifyMate.err', 'w')
    command = 'minimap2 ' + db + ' ' + FASTA_file + ' > ' + PAF_file
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'IDENTIFY MATE SEQ'
        msg = 'Identify mate sequence failed'
        log.step(step, msg)

    # If PAF file is not empty
    if os.stat(PAF_file).st_size != 0:
        PAFObj = formats.PAF()
        PAFObj.read(PAF_file)

        # Pick the alignment with the highest number of matches
        aligmentMaxNbMatches = PAFObj.sortNbMatches()[0]

    else:
        aligmentMaxNbMatches = None

    return aligmentMaxNbMatches
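A minimal usage sketch (all paths are hypothetical); the returned value is a single PAF alignment, so the target fields used elsewhere in this module apply.

best = aligmentMaxNbMatches('mate.fa', 'genome.mmi', 'mate.paf', 'out')  # hypothetical paths

if best is not None:
    print(best.tName, best.tBeg, best.tEnd)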
Example #4
def search4partnered_5prime(structures, fasta, reference, outDir):
    '''
    Search for partnered 5' transductions by realigning unresolved 5' ends of inserted sequences on the reference genome

    Input:
        1. structures: dictionary containing insertion structure information per insertion id
        2. fasta: FASTA object containing the inserted sequences
        3. reference: Path to the reference genome
        4. outDir: Output directory

    Output:
        1. structures: input dictionary updated with 5' partnered transduction calls
    '''
    ## 1. Create Fasta with sequences to realign
    seq2realign = formats.FASTA()

    for insId in structures:

        # Discard if strand not determined
        if structures[insId]['STRAND'] is None:
            continue

        ## Extract unresolved 5' sequence if any
        qBeg, qEnd = structures[insId]['CHAIN'].interval()

        if structures[insId]['STRAND'] == '+':
            seq2realign.seqDict[insId] = fasta.seqDict[insId][:qBeg]

        else:
            seq2realign.seqDict[insId] = fasta.seqDict[insId][qEnd:]

    fastaPath = outDir + '/seq2realign.5prime.fasta'
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    SAM_path = alignment.alignment_bwa(fastaPath, reference,
                                       'hits2genome.5prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.5prime', outDir)

    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 3. Make 5' transduction calls
    # For each hit
    for hit in PAF.alignments:

        hit.tName = 'chr' + hit.tName
        iRef, coord = hit.qName.split(':')
        iBeg = int(coord) - 500
        iEnd = int(coord) + 500

        ## Filter out hits
        if (hit.alignmentPerc() < 75) or (hit.MAPQ < 30) or (
                iRef == hit.tName
                and gRanges.overlap(iBeg, iEnd, hit.tBeg, hit.tEnd)[0]):
            continue

        ## Make call
        structures[hit.qName]['ITYPE'] = 'partnered'
        structures[hit.qName]['5PRIME'] = True
        structures[hit.qName]['TDCOORD_5PRIME'] = hit.tName + ':' + str(
            hit.tBeg) + '-' + str(hit.tEnd)
        structures[hit.qName]['TDLEN_5PRIME'] = hit.tEnd - hit.tBeg

    return structures
Example #5
def group_alignments(paf):
    '''
    Group PAF alignments by query name into a dictionary of PAF objects

    Input:
        1. paf: PAF object containing alignments

    Output:
        1. pafDict: dictionary with query names as keys and a PAF object containing that query's alignments as values
    '''
    pafDict = {}

    ## For each hit
    for hit in paf.alignments:

        # Initialize paf object for this inserted sequence
        if hit.qName not in pafDict:
            pafDict[hit.qName] = formats.PAF()

        # Add hit to the corresponding paf
        pafDict[hit.qName].alignments.append(hit)

    return pafDict
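A minimal usage sketch (hypothetical PAF path); the per-query PAF objects can then be chained as call_NUMT does below, the 20/50 chaining parameters being purely illustrative.

PAF = formats.PAF()
PAF.read('hits.paf')                 # hypothetical PAF file
pafDict = group_alignments(PAF)

for insId, insPAF in pafDict.items():
    chain = insPAF.chain(20, 50)     # illustrative chaining parameters
    print(insId, chain.perc_query_covered())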
Example #6
def retrotransposon_structure(FASTA_file, index, outDir):
    '''    
    Infer the insertion size, structure, poly-A, target site duplication length and other insertion structural features

    Input:
        1. FASTA_file: Path to FASTA file containing the sequence
        2. index: Minimap2 index for consensus retrotransposon sequences database
        3. outDir: Output directory
        
    Output:
        1. structure: dictionary containing insertion structure information
    '''
    structure = {}

    ## 0. Create logs directory ##
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Align the sequence against the retrotransposon sequences database ##
    PAF_file = alignment.alignment_minimap2(FASTA_file, index,
                                            'alignment2consensus', 1, outDir)

    ## 2. Read PAF alignments ##
    PAF = formats.PAF()
    PAF.read(PAF_file)

    # Exit function if no hit on the retrotransposons database
    if not PAF.alignments:
        return structure

    ## 3. Chain complementary alignments ##
    chain = PAF.chain(100, 20)

    ## 4. Infer insertion features ##
    ## Retrieve inserted seq
    FASTA = formats.FASTA()
    FASTA.read(FASTA_file)
    sequence = list(FASTA.seqDict.values())[0]

    ## 4.1 Insertion type
    structure['INS_TYPE'], structure['FAMILY'], structure[
        'CYTOBAND'] = insertion_type(chain)

    ## 4.2 Insertion strand
    structure['STRAND'], structure['POLYA'] = infer_strand(
        structure['INS_TYPE'], sequence, chain)

    ## 4.3 Sequence lengths
    lengths = infer_lengths(structure['INS_TYPE'], chain, structure['STRAND'])
    structure.update(lengths)

    ## 4.4 Insertion mechanism (TPRT or EI)
    structure['MECHANISM'] = infer_integration_mechanism(
        chain, structure['TRUNCATION_3_LEN'], structure['POLYA'])

    ## 4.5 Target site duplication (TO DO LATER...)
    #search4tsd()

    ## 4.6 Percentage resolved
    structure['PERC_RESOLVED'] = chain.perc_query_covered()

    return structure
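A minimal usage sketch (hypothetical paths); the keys read from the returned dictionary are the ones populated above, and the 60% threshold is purely illustrative.

structure = retrotransposon_structure('insertion.fa', 'consensus.mmi', 'out')  # hypothetical paths

if structure and structure['PERC_RESOLVED'] >= 60:   # illustrative threshold
    print(structure['INS_TYPE'], structure['FAMILY'], structure['STRAND'])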
Example #7
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call NUMT insertions (nuclear integrations of mitochondrial DNA) from inserted sequences in a VCF

    Input:
        1. vcf: VCF object containing INS calls with inserted sequences
        2. mtGenome: Path to the mitochondrial genome FASTA
        3. outDir: Output directory

    Output:
        1. outVCF: VCF object containing NUMT calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1,
                                            tmpDir)
    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMTs calls
    NUMTs = {}

    for insId in PAFs_mt:
        chain = PAFs_mt[insId].chain(20, 50)

        # Make NUMT call if enough % of sequence resolved
        if chain.perc_query_covered() >= 60:

            coords = chain.interval_template()

            NUMT = {}
            NUMT['ITYPE'] = 'NUMT'
            NUMT['MT_COORD'] = str(coords[0]) + '-' + str(coords[1])
            NUMTs[insId] = NUMT

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for output dictionary
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered or orphan)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']
                }
    outVCF.header.info.update(info2add)

    ## Select INS corresponding to NUMT calls and update the INFO field with NUMT features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in NUMTs):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
Example #8
def resolve_partnered_3prime(structures, fasta, reference, sourceDb, outDir):
    '''
    Resolve 3' partnered transduction candidates by realigning the candidate transduced sequences on the reference genome

    Input:
        1. structures: dictionary containing insertion structure information per insertion id
        2. fasta: FASTA object containing the inserted sequences
        3. reference: Path to the reference genome
        4. sourceDb: database of known source elements, queried per chromosome
        5. outDir: Output directory

    Output:
        1. structures: input dictionary updated with 3' partnered transduction calls (or downgraded to solo calls)
    '''
    ## 1. Create Fasta with sequences to realign
    seq2realign = formats.FASTA()
    pattern = re.compile("Partnered_[0-9]+")
    partneredDict = {}

    for insId in structures:

        # Discard solo
        if structures[insId]['ITYPE'] != 'partnered':
            continue

        ## Initialize partnered dict for the ins
        partneredDict[insId] = {}
        partneredDict[insId]['NB_PARTNERED'] = 0
        partneredDict[insId]['NB_RESOLVED'] = 0

        # For each hit
        for hit in structures[insId]['CHAIN'].alignments:

            # Discard if not partnered
            if not pattern.match(hit.tName):
                continue

            # Add candidate partnered sequence to the fasta
            seqId = insId + '|' + hit.tName
            seq = fasta.seqDict[insId][hit.qBeg:hit.qEnd]
            seq2realign.seqDict[seqId] = seq

            ## Update partnered dictionary
            partneredDict[insId]['NB_PARTNERED'] += 1
            partneredDict[insId][hit.tName] = hit

    fastaPath = outDir + '/seq2realign.3prime.fasta'
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    SAM_path = alignment.alignment_bwa(fastaPath, reference,
                                       'hits2genome.3prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.3prime', outDir)
    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 3. Add hit information to partnered transduction candidates
    hits = PAF.hits2dict()

    # For each partnered event
    for ID in hits:

        insId, tdId = ID.split('|')
        partneredDict[insId]['CYTOID'] = None

        # For each hit
        for hit in hits[ID]:

            hit.tName = 'chr' + hit.tName

            ## Check if it's a partnered transduction from a known source element
            if (hit.tName
                    in sourceDb) and sourceDb[hit.tName].collect_interval(
                        hit.tBeg - 200, hit.tEnd + 200, 'ALL'):
                source = sourceDb[hit.tName].collect_interval(
                    hit.tBeg - 200, hit.tEnd + 200, 'ALL')[0][0]
                partneredDict[insId]['CYTOID'] = source.optional['cytobandId']

            ## Filter out hits
            if (hit.alignmentPerc() < 75 or hit.MAPQ < 30) and (
                    partneredDict[insId]['CYTOID'] is None):
                continue

            ## Add hit information
            partneredDict[insId]['NB_RESOLVED'] += 1
            partneredDict[insId][tdId].tName = hit.tName + ':' + str(
                hit.tBeg) + '-' + str(hit.tEnd)

    ## 4. Add transduction information
    for insId in partneredDict:

        # a) Make transduction call
        if (partneredDict[insId]['NB_PARTNERED'] >
                0) and (partneredDict[insId]['NB_PARTNERED']
                        == partneredDict[insId]['NB_RESOLVED']):

            tdIds = [
                key for key in partneredDict[insId].keys()
                if key not in ['NB_PARTNERED', 'NB_RESOLVED', 'CYTOID']
            ]
            structures[insId]['CYTOID'] = partneredDict[insId]['CYTOID']
            structures[insId]['TDCOORD_3PRIME'] = ','.join(
                [partneredDict[insId][tdId].tName for tdId in tdIds])
            structures[insId]['TDLEN_3PRIME'] = ','.join([
                str(partneredDict[insId][tdId].qEnd -
                    partneredDict[insId][tdId].qBeg) for tdId in tdIds
            ])

        # b) Make solo call
        else:
            structures[insId]['ITYPE'] = 'solo'

    return structures
Example #9
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call mobile element insertions (MEI) from inserted sequences in a VCF

    Input:
        1. vcf: VCF object containing INS calls with inserted sequences
        2. consensus: Path to FASTA containing consensus retrotransposon sequences
        3. reference: Path to the reference genome
        4. sourceDb: database of known source elements, queried per chromosome
        5. outDir: Output directory

    Output:
        1. outVCF: VCF object containing MEI calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus:
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex,
                                            'hits2consensus', 1, tmpDir)
    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary
    index = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"
    PAF_path = alignment.alignment_minimap2(fastaPath, index, 'hits2small_MEI',
                                            1, tmpDir)

    ## Align inserted sequences against the reference genome
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId],
                                          fasta.seqDict[insId])
        seqBeg, seqEnd = structures[insId]['CHAIN'].interval()

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference,
                                          sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta) # TO FINISH LATER (Only two L1 orphan transductions so far..)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for output dictionary
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']
                }

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update the INFO field with MEI features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in structures) or ((insId in structures) and
                                         (structures[insId]['PASS'] is False)):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup
    #unix.rm([tmpDir])

    return outVCF