Example #1
def call_MEI_candidate(VCF):
    '''
    Split INS calls in the input VCF into candidate MEI (inserted sequence with
    a poly(A) or poly(T) tail) and non-candidate calls. Return both VCF objects
    '''
    ## Create VCF with candidate MEI calls
    candidateVCF = formats.VCF()
    candidateVCF.header = VCF.header

    ## Create VCF with non-candidate MEI
    filteredVCF = formats.VCF()
    filteredVCF.header = VCF.header

    ## For each variant
    for variant in VCF.variants:

        ## Discard calls that are not insertions (INS)
        if variant.info['SVTYPE'] != 'INS':
            continue

        ## Search for poly(A) tail at inserted sequence
        polyA, monomerA = search4polyA(variant.alt)

        ## Search for poly(T) tail at inserted sequence
        polyT, monomerT = search4polyT(variant.alt)

        ## a) Filter out calls where neither poly(A) nor poly(T) was found
        if not polyA and not polyT:
            filteredVCF.add(variant)

        ## b) Add variant passing all the filters
        else:
            candidateVCF.add(variant)

    return candidateVCF, filteredVCF
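# search4polyA and search4polyT are project helpers that are not part of this
# excerpt. A minimal sketch of what the poly(A) check could look like; the 10 bp
# window and 80% A-content threshold below are assumptions, not the tool's values:
def search4polyA_sketch(seq, window=10, minFrac=0.8):
    '''Return (found, tail) for a poly(A)-like run at the 3' end of seq'''
    tail = seq[-window:].upper()
    found = len(tail) == window and tail.count('A') >= minFrac * window
    return found, (tail if found else None)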
Example #2
print "fastaFile: ", fastaFile
print "refDir: ", refDir
print "fileName: ", fileName
print "outDir: ", outDir
print
print "***** Executing ", scriptName, ".... *****"
print

## Start ##

#### 1. Read fasta file containing supporting reads
fastaObj = fasta()
fastaObj.fasta_reader(fastaFile)

#### 2. Read VCF file
VCFObj = formats.VCF()
VCFObj.read_VCF(inputVCF)

## For each MEI
for VCFlineObj in VCFObj.lineList:

    ### Define insertion Id
    chrom = VCFlineObj.chrom
    pos = str(VCFlineObj.pos)
    insertionType = VCFlineObj.infoDict["TYPE"]
    family = VCFlineObj.infoDict["CLASS"] if "CLASS" in VCFlineObj.infoDict else 'NA'

    ## Initialize subFamily as unknown
    percDiv = "NA"
    subFamily = "NA"
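# The fasta class comes from the project's own modules and is not shown in this
# excerpt. A minimal stand-in with an equivalent reader interface could look like
# this (seqDict follows the attribute name used in the call_MEI example below):
class fasta_sketch():

    def __init__(self):
        self.seqDict = {}

    def fasta_reader(self, fastaFile):
        '''Load a FASTA file into seqDict (sequence id -> sequence)'''
        seqId = None
        with open(fastaFile) as fh:
            for line in fh:
                line = line.rstrip()
                if line.startswith('>'):
                    seqId = line[1:].split()[0]
                    self.seqDict[seqId] = ''
                elif seqId:
                    self.seqDict[seqId] += line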
Example #3
        #print "test: ", donorId, tumorType

#print "donorIdProjectCodeDict: ", donorIdProjectCodeDict

#### 2. Compute the allele count of each source element in EOPC-DE
###################################################################
## EOPC-DE is the tumor type with available samples for the validation of L1 source elements.
# Initialize a dictionary with the following structure:
# - dict1: key(sourceElementId) -> dict2: key1("alleleCount") -> value1(alleleCount value)
#                                         key2("donorIdList") -> list of donor ids containing the insertion
# sourceElementId: chr:beg-end
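# Example of the resulting structure with hypothetical identifiers and counts:
# alleleCountsDict = {
#     "22:100000-106000": {"alleleCount": 3,
#                          "donorIdList": ["DO0001", "DO0002", "DO0003"]}
# }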

header("2. Compute the allele count of each source element in EOPC-DE")

VCFObj = formats.VCF()
donorIdList = VCFObj.read_VCF_multiSample(sourceElementGt)

alleleCountsDict = {}

## For each MEI:
for MEIObj in VCFObj.lineList:

    end = (MEIObj.infoDict["BKPB"] if "BKPB" in MEIObj.infoDict else "UNK")

    sourceElementId = MEIObj.chrom + ':' + str(MEIObj.pos) + '-' + str(end)
    print "** source element ** ", sourceElementId

    ## Initialize source element dictionary
    alleleCountsDict[sourceElementId] = {}
    alleleCountsDict[sourceElementId]["alleleCount"] = 0
Example #4
print "***** ", scriptName, " configuration *****"
print "VCF: ", VCFPaths
print "sampleId: ", sampleId
print "outDir: ", outDir
print
print "***** Executing ", scriptName, ".... *****"
print

## Start ##

#### 1. Create VCF object and read input VCF
header("1. Process input VCFs")
paths = open(VCFPaths, 'r')

# Make merged VCF object
completeVCFObj = formats.VCF()

## Read one VCF per iteration and add the variants to the merged VCF
for VCFfile in paths:

    VCFfile = VCFfile.rstrip('\n\r')
    VCFObj = formats.VCF()

    VCFObj.read_VCF(VCFfile)

    # Add variant objects
    for lineObj in VCFObj.lineList:
        completeVCFObj.addLine(lineObj)

    # Create header
    if completeVCFObj.header == "":
        completeVCFObj.header = VCFObj.header
Example #5
    targetDonorsList = targetDonorsList + donorIdList

    ## Open output file
    outFilePath = outDir + '/' + ancestryCode + '_donorIdList.tsv'
    outFile = open(outFilePath, 'w')

    ## Write each donorId in the output file. One id per row
    for donorId in donorIdList:
        row = donorId + '\n'
        outFile.write(row)

    outFile.close()

#### 2. Read input multi-sample VCF and generate a VCF object
###############################################################
header("2. Read input multi-sample VCF and generate a VCF object")

VCFObj = formats.VCF()
VCFObj4Fst = formats.VCF()

VCFObj.read_VCF_multiSample(inputVCF)

#### 3. Select target donors and source elements
##################################################
header("3. Select target donors and source elements")

## target source elements are rare elements with a MAF < 1%
targetSourceList = [
    "1p35.2", "1q23.3", "2q21.3", "3p24.1", "3q26.1", "5q13.1", "7p12.3",
    "7q31.2", "8p23.1f", "9q22.33", "10q25.1", "11p11.2", "11q14.2", "21q21.1"
]

## For each MEI:
Example #6
print "***** ", scriptName, " configuration *****"
print "VCF1KGP: ", VCF1KGP
print "VCFPCAWG: ", VCFPCAWG
print "outDir: ", outDir
print
print "***** Executing ", scriptName, ".... *****"
print

## Start ##

#### 1. Read input VCFs and generate VCF objects
#############################################################
header("1. Process input VCFs ")

## 1000 genomes multi-sample VCF
VCFObj1KGP = formats.VCF()
donorIdList1KGP = VCFObj1KGP.read_VCF_multiSample(VCF1KGP)

## PCAWG VCF
VCFObjPCAWG = formats.VCF()
VCFObjPCAWG.read_VCF(VCFPCAWG)

#### 2. Select common MEI from 1000 genomes and PCAWG
############################################################
header("2. Select common MEI from 1000 genomes and PCAWG")

outVCFObj = formats.VCF()

counter = 1

## For each PCAWG MEI
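# A sketch of how a PCAWG MEI could be matched against the 1000 genomes calls.
# Matching on the same chromosome within a small window is an assumption about
# the selection criterion, and the 10 bp window is arbitrary:
def is_common_MEI_sketch(MEIObj, VCFObj1KGP, window=10):
    for MEI1KGPObj in VCFObj1KGP.lineList:
        if (MEI1KGPObj.chrom == MEIObj.chrom) and (abs(MEI1KGPObj.pos - MEIObj.pos) <= window):
            return True
    return False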
Example #7
## Start ##

#### 1. Create database with all MEI events
############################################

## First, make a list with all the MEI identified across all the provided samples
allMEIlist = []

with open(VCFs) as VCFs:

    ### Process a VCF in each iteration
    for VCF in VCFs:
        VCF = VCF.rstrip('\n')

        ## 1. Generate VCF object
        VCFObj = formats.VCF()
        VCFObj.read_VCF(VCF)
        VCFheader = VCFObj.header

        ## Select insertions passing all the filters
        for MEIObj in VCFObj.lineList:

            if (MEIObj.filter == "PASS"):
                allMEIlist.append(MEIObj)

## Then organize them into a dictionary
MEIDict = organizeMEI(allMEIlist)
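# organizeMEI is defined elsewhere in the project. A minimal sketch of such an
# organisation step, assuming MEI objects are grouped by chromosome:
def organizeMEI_sketch(MEIlist):
    MEIDict = {}
    for MEIObj in MEIlist:
        MEIDict.setdefault(MEIObj.chrom, []).append(MEIObj)
    return MEIDict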

#### 2. Generate a consensus VCF with a non-redundant list of MEI events
##########################################################################
Example #8
    ## A) Matched normal VCF not provided
    if (germlineVCF == False):
        msg = "Matched normal VCF not provided"
        log("WARNING", msg)
        germlineMEIDict = False

    ## B) Normal VCF provided but the file does not exist
    elif not (os.path.isfile(germlineVCF)):
        msg = "Matched normal VCF does not exist"
        log("WARNING", msg)
        germlineMEIDict = False

    ## C) Normal VCF provided -> Organize germline MEI into a dictionary
    else:
        germlineVCFObj = formats.VCF()
        germlineVCFObj.read_VCF(germlineVCF)
        germlineMEIDict = organizeMEI(germlineVCFObj.lineList)

#### 1. Create somatic VCF object and read input VCF
VCFObj = formats.VCF()
VCFObj.read_VCF(inputVCF)

#### 2. Find somatic duplicated insertions
# Duplicated filtering flag provided
if "DUP" in filterList:
    dupList = findDuplicates(VCFObj.lineList)

    print "number_duplicates: ", len(dupList), dupList

#### 3. Organize somatic MEI into a dictionary.
Example #9
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call NUMTs (nuclear insertions of mitochondrial DNA) by aligning the
    inserted sequences against the mitochondrial genome. Return a VCF object
    with the NUMT calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1,
                                            tmpDir)
    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMTs calls
    NUMTs = {}

    for insId in PAFs_mt:
        chain = PAFs_mt[insId].chain(20, 50)

        # Make NUMT call if enough % of sequence resolved
        if chain.perc_query_covered() >= 60:

            coords = chain.interval_template()

            NUMT = {}
            NUMT['ITYPE'] = 'NUMT'
            NUMT['MT_COORD'] = str(coords[0]) + '-' + str(coords[1])
            NUMTs[insId] = NUMT

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for the output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-prime inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']
                }
    outVCF.header.info.update(info2add)
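    # Each info2add entry corresponds to one VCF INFO header line; the ITYPE
    # entry, for instance, would render roughly as follows (exact formatting
    # depends on the formats.VCF implementation):
    # ##INFO=<ID=ITYPE,Number=.,Type=String,Description="Type of insertion (solo, partnered, orphan or NUMT)">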

    ## Select INS corresponding to NUMT calls and update their info field with the NUMT features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in NUMTs):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
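# The >= 60 cutoff above relies on the chain's perc_query_covered() method from
# the project's formats module. A simplified version of that idea, assuming it
# measures the fraction of the inserted sequence covered by the chained
# alignments (overlaps between alignments are ignored for brevity):
def perc_query_covered_sketch(intervals, queryLen):
    '''intervals: list of (qBeg, qEnd) tuples taken from the chained alignments'''
    covered = sum(end - beg for beg, end in intervals)
    return 100.0 * covered / queryLen if queryLen else 0.0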
Example #10
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call mobile element insertions (MEI) by aligning the inserted sequences
    against the retrotransposon consensus sequences and resolving their
    structure. Return a VCF object with the MEI calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus:
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex,
                                            'hits2consensus', 1, tmpDir)
    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary
    index = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"
    PAF_path = alignment.alignment_minimap2(fastaPath, index, 'hits2small_MEI',
                                            1, tmpDir)

    ## Align inserted sequences against the reference genome
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId],
                                          fasta.seqDict[insId])
        seqBeg, seqEnd = structures[insId]['CHAIN'].interval()

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference,
                                          sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta) # TO FINISH LATER (Only two L1 orphan transductions so far..)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for the output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-prime inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']
                }

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update their info field with the MEI features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in structures) or (structures[insId]['PASS'] is False):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
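# Hypothetical invocation of call_MEI (paths and the sourceDb object are
# placeholders, not real inputs):
# vcf = formats.VCF()
# vcf.read('insertions.vcf')
# MEI_VCF = call_MEI(vcf, 'consensus.fa', 'hg38.fa', sourceDb, 'out_dir')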
Example #11
print()
print('***** ', scriptName, 'configuration *****')
print('vcf: ', vcf)
print('consensus: ', consensus)
print('reference: ', reference)
print('mtGenome: ', mtGenome)
print('fileName: ', fileName)
print('outDir: ', outDir, "\n")

##########
## MAIN ##
##########
## Note: NUMT detection is disabled

## 1. Read VCF
VCF = formats.VCF()
VCF.read(vcf)

## 2. Load source elements database

annotDir = '/Users/brodriguez/Research/Projects/HGSVC2/Analysis/Source_L1/V2/data/'
annotations = annotation.load_annotations(['TRANSDUCTIONS'],
                                          VCF.header.refLengths, annotDir,
                                          None, 1, outDir)

## 3. Filter VCF by selecting retrotransposition insertion candidates
# (inserted sequences with polyA/T tails at their ends)
candidateVCF, filteredVCF = call_MEI_candidate(VCF)

## 4. Search for NUMTs
#NUMT_VCF = call_NUMT(filteredVCF, mtGenome, outDir)
Example #12
print "outDir: ", outDir
print
print "***** Executing ", scriptName, ".... *****"
print


## Start ## 

#### 1. Read input VCFs and generate VCF objects
###################################################
# Important requirement: both VCFs must contain the same MEIs, sorted in the same order
header("1. Process input VCFs ")


## Normal genome multi-sample VCF
VCFObjNormal = formats.VCF()
donorIdListNormal = VCFObjNormal.read_VCF_multiSample(VCFnormal)

## Tumor genome multi-sample VCF
VCFObjTumor = formats.VCF()
donorIdListTumor = VCFObjTumor.read_VCF_multiSample(VCFtumor)

#### 2. Identify MEI that are blood/normal specific singletons
###############################################################
# These cases are potential blood somatic events...
header("2. Identify MEI that are blood/normal specific singletons")

# Make VCF object with all the candidate blood somatic MEI
VCFObjBloodSomatic = formats.VCF()
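# A sketch of a per-MEI "normal-specific singleton" test, assuming the genotypes
# of a multi-sample VCF line are available as a donorId -> genotype mapping (the
# data layout and the carrier criterion below are assumptions):
def is_normal_singleton_sketch(genotypesDict):
    '''genotypesDict: donorId -> genotype string such as "0/1" or "0|0"'''
    carriers = [donorId for donorId, gt in genotypesDict.items() if "1" in gt.split(":")[0]]
    return len(carriers) == 1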

### For each MEI