예제 #1
0
파일: Run_Delly.py 프로젝트: rhshah/iCallSV
def run(
        delly,
        version,
        bcftools,
        analysisType,
        reference,
        controlBam,
        caseBam,
        caseId,
        mapq,
        excludeRegions,
        outputdir,
        verbose,
        debug):
    """
    Run Delly on a case/control BAM pair and return the path of the VCF.

    For Delly versions >= 0.7.3 the ``call`` sub-command is used, calls are
    written to ``<outputdir>/<caseId>_<type>.bcf`` and then converted to VCF
    with ``bcftools view``. Older versions write the VCF directly. Missing
    ``.bai`` index files are created through ``mbi.MakeIndex`` before Delly is
    started.

    :param str delly: Path to delly executables (0.7.3 or above)
    :param version: Delly version, compared against ``StrictVersion('0.7.3')``
    :param str bcftools: Path to bcftools executables (1.3.1 or above)
    :param str analysisType: What to run in delly, DEL:Deletion, DUP:Duplication, TRA:Translocation, INV:Inversion
    :param str reference: Reference Genome that was used to align the reads.
    :param str controlBam: Path to control/normal bam file
    :param str caseBam: Path to case/tumor bam file
    :param str caseId: Id of the case/tumor sample, used as the output file prefix
    :param int mapq: mapping quality cutoff for delly
    :param str excludeRegions: Regions to be excluded for calling structural variation.
    :param str outputdir: directory for the output of delly
    :param bool verbose: If set, progress and failure messages are logged
    :param bool debug: If set, only log the command that would be run; nothing is executed
    :return: str of the output vcf
    :rtype: str

    The process is terminated with ``sys.exit(1)`` when Delly or bcftools
    exits with a non-zero status or when the expected output file is missing.
    """

    start_time = time.time()
    if(verbose):
        logger.info("Run_Delly: We are now going to run Delly for you. It going to be exciting time.")
    myPid = os.getpid()
    today = date.today().isoformat()
    tag = analysisType.lower()
    outputBcf = outputdir + "/" + caseId + "_" + tag + ".bcf"
    outputVcf = outputdir + "/" + caseId + "_" + tag + ".vcf"
    # Check all input variables
    cp.checkFile(controlBam)
    cp.checkFile(caseBam)
    cp.checkFile(delly)
    cp.checkEmpty(version, "Delly Version")
    cp.checkFile(bcftools)
    cp.checkFile(reference)
    cp.checkFile(excludeRegions)
    cp.checkDir(outputdir)
    cp.checkInt(mapq, "Delly MAPQ")
    cp.checkEmpty(caseId, "Delly Case BAM ID")
    cp.checkDellyAnalysisType(analysisType)
    if(verbose):
        logger.info("Run_Delly: All the input parameters look good for running delly")
        logger.info("Run_Delly: ProcessID:%s,Date:%s", myPid, today)

    # Build the argument vector directly instead of splitting a concatenated
    # string, so paths containing spaces or quotes reach delly unchanged.
    usesCallSubcommand = version >= StrictVersion('0.7.3')
    dellyArgs = [delly]
    if(usesCallSubcommand):
        dellyArgs.append("call")
    dellyArgs.extend(["-t", analysisType,
                      "-g", reference,
                      "-x", excludeRegions,
                      "-q", str(mapq),
                      "-o", outputBcf if usesCallSubcommand else outputVcf,
                      caseBam,
                      controlBam])
    cmd = " ".join(dellyArgs)

    if(debug):
        # Dry run: report exactly the command a real run would execute.
        logger.debug("Run_Delly: Command that will be run %s", cmd)
        return(outputVcf)

    # Check if bam index files are there else make them
    for bam in (controlBam, caseBam):
        bai = bam + ".bai"
        if(os.path.isfile(bai)):
            if(verbose):
                logger.info("Run_Delly: Bam Index file is present for %s ", bai)
        else:
            if(verbose):
                logger.warning(
                    "Run_Delly: Bam Index file is not present and we will make it for %s ",
                    bai)
            mbi.MakeIndex(bam)

    if(verbose):
        logger.info("Run_Delly: Command that will be run:%s", cmd)
    proc = Popen(dellyArgs)
    proc.wait()
    retcode = proc.returncode
    # Any non-zero status is a failure: positive values are error exits,
    # negative values mean the child was killed by a signal.
    if(retcode != 0):
        if(verbose):
            logger.fatal(
                "Run_Delly: Delly is either still running on local machine or it errored out with return code %d for %s",
                retcode,
                caseId)
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    if(verbose):
        logger.info(
            "Run_Delly: We have finished running Delly for %s using local machine", caseId)
        logger.info("Run_Delly Duration: %s", totaltime)

    if(not usesCallSubcommand):
        # Older Delly writes the VCF itself; it only has to exist.
        if(os.path.isfile(outputVcf)):
            return(outputVcf)
        if(verbose):
            logger.fatal(
                "Run_Delly: Delly is either still running on local machine or it errored out with return code %d for %s",
                retcode,
                caseId)
        sys.exit(1)

    if(not os.path.isfile(outputBcf)):
        if(verbose):
            logger.fatal(
                "Run_Delly_bcf2vcf: bcf file was not generated the return code is %d for %s",
                retcode,
                caseId)
        # Exit regardless of verbosity; otherwise a non-existent VCF path
        # would be returned silently.
        sys.exit(1)

    # Convert the BCF written by Delly into a plain-text VCF.
    bcftoolsArgs = [bcftools, "view", outputBcf, "-O", "v", "-o", outputVcf]
    if(verbose):
        logger.info("Run_Delly_bcf2vcf: Command that will be run:%s", " ".join(bcftoolsArgs))
    proc = Popen(bcftoolsArgs)
    proc.wait()
    retcode = proc.returncode
    if(retcode != 0):
        if(verbose):
            logger.fatal(
                "Run_Delly_bcf2vcf: bcftools is either still running on local machine or it errored out with return code %d for %s",
                retcode,
                caseId)
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    if(verbose):
        logger.info(
            "Run_Delly_bcf2vcf: We have finished running bcftools for %s using local machine", caseId)
        logger.info("Run_Delly_bcf2vcf Duration: %s", totaltime)
    return(outputVcf)
예제 #2
0
def run(
        RLocation,
        targetSeqView,
        nodes,
        bamFilePath,
        svFile,
        build,
        readLength,
        outputDir,
        outsvFileName):
    """
    Run the targetSeqView R script and return the path of its output file.

    The R interpreter at ``<RLocation>/bin/R`` is started with
    ``--slave --vanilla --args ...``; the script is fed on standard input and
    standard output/error are written to
    ``<outputDir>/<outsvFileName minus its last 4 characters>_<pid>.stdout``
    and ``.stderr``. An existing output file is removed before the run.

    :param str RLocation: Location of the R installation (>3.1.2); ``/bin/R`` is appended.
    :param str targetSeqView: Location of R script that will run targetSeqView
    :param int nodes: Number of parallel nodes for running targetSeqView
    :param str bamFilePath: Location of the bam files which have the structural variant events (validated as a directory).
    :param str svFile: targetSeqView compatible input structural variant file.
    :param str build: Which human reference file to be used, hg18,hg19 or hg38
    :param int readLength: Sequencing Read Length (101)
    :param str outputDir: Directory for output files
    :param str outsvFileName: Name of the output structural variant file that has added confidence score to it.
    :return: str of the output file
    :rtype: str

    The process is terminated with ``sys.exit(1)`` when R cannot be started
    or exits with a non-zero status.
    """
    start_time = time.time()
    logger.info(
        "We will now be running targetSeqView. Hope fully the R package targetSeqView is installed.")
    cp.checkFile(targetSeqView)
    cp.checkInt(nodes, "Number of nodes to run targetSeqView")
    cp.checkDir(bamFilePath)
    cp.checkFile(svFile)
    cp.checkEmpty(build, "Genome build to be used for targetSeqView")
    cp.checkInt(readLength, "Sequencing Read Length")
    cp.checkDir(outputDir)
    logger.info("All Input Parameters look good. Lets Run targetSeqView")
    RLocation = RLocation + "/bin/R"
    myPid = os.getpid()
    today = date.today().isoformat()
    logger.info("Run_targetSeqView: ProcessID: %s, Date: %s", myPid, today)
    outputFile = outputDir + "/" + outsvFileName
    logPrefix = outputDir + "/" + outsvFileName[:-4] + "_" + str(myPid)
    stdoutFile = logPrefix + ".stdout"
    stderrFile = logPrefix + ".stderr"
    # Argument vector for R; the redirections are done with file handles
    # below so no shell has to interpret the (possibly space-containing) paths.
    rArgs = [RLocation, "--slave", "--vanilla", "--args",
             str(nodes), bamFilePath, svFile, build, str(readLength),
             outputDir, outsvFileName]
    cmd = " ".join(rArgs) + " < " + targetSeqView + " > " + stdoutFile + " 2> " + stderrFile
    logger.info("Run_targetSeqView: Command that will be run %s", cmd)
    # Remove if the file exists
    if(os.path.isfile(outputFile)):
        os.remove(outputFile)
    with open(targetSeqView, "r") as scriptHandle, \
            open(stdoutFile, "w") as stdoutHandle, \
            open(stderrFile, "w") as stderrHandle:
        try:
            proc = Popen(rArgs, stdin=scriptHandle, stdout=stdoutHandle, stderr=stderrHandle)
        except OSError as err:
            # RLocation is not validated above; report a missing or
            # non-executable R instead of surfacing a raw traceback.
            logger.error("Run_targetSeqView: Unable to start %s: %s", RLocation, err)
            sys.exit(1)
        proc.wait()
    retcode = proc.returncode
    # Any non-zero status (error exit or death by signal) is a failure.
    if(retcode != 0):
        logger.error(
            "Run_targetSeqView: targetSeqView is either still running on local machine or it errored out with return code %d for %s",
            retcode,
            svFile)
        # Exit with a failing status so calling shells/pipelines notice.
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    logger.info(
        "Run_targetSeqView: We have finished running targetSeqView for %s using local machine.",
        svFile)
    logger.info("Run_targetSeqView Duration: %s", totaltime)
    return(outputFile)
예제 #3
0
def run(
        python,
        iAnnotateSV,
        build,
        distance,
        canonicalTranscriptFile,
        uniprotFile,
        cosmicFile,
        cosmicCountsFile,
        repeatregionFile,
        dgvFile,
        inputTabFile,
        outputPrefix,
        outputDir):
    """
    Run the iAnnotateSV wrapper script and return the annotated file path.

    The script is started as ``<python> <iAnnotateSV> -r ... -v -p -u ...``.
    An existing ``<outputDir>/<outputPrefix>_Annotated.txt`` is removed before
    the run so a stale file is never mistaken for fresh output.

    :param str python: Location for the python executable.
    :param str iAnnotateSV: Location of the wrapper iAnnotateSV package (iAnnotateSV.py)
    :param str build: Which human reference file to be used, hg18,hg19 or hg38
    :param int distance: Distance for extending the promoter region
    :param str canonicalTranscriptFile: Location for the canonical transcript file
    :param str uniprotFile: Location for ucsc uniprot file
    :param str cosmicFile: Location for cosmic census file
    :param str cosmicCountsFile: Location for cosmic fusion counts file
    :param str repeatregionFile: Location for repeat region file
    :param str dgvFile: Location for database of Genomic Variants file
    :param str inputTabFile: Tab-Delimited Input File compatible with iAnnotateSV package.
    :param str outputPrefix: Prefix of the output files/DIR with Annotations and images
    :param str outputDir: Name of the output directory where the outputPrefix will be written
    :return: str of the output file
    :rtype: str

    The process is terminated with ``sys.exit(1)`` when iAnnotateSV exits
    with a non-zero status.
    """

    start_time = time.time()
    cp.checkDir(outputDir)
    cp.checkFile(iAnnotateSV)
    cp.checkFile(inputTabFile)
    cp.checkFile(python)
    cp.checkInt(distance, "Distance for extending the promoter region")
    cp.checkEmpty(build, "Which human reference file to be used, hg18,hg19 or hg38")
    cp.checkFile(canonicalTranscriptFile)
    cp.checkFile(uniprotFile)
    cp.checkFile(cosmicFile)
    cp.checkFile(cosmicCountsFile)
    cp.checkFile(repeatregionFile)
    cp.checkFile(dgvFile)
    logger.info("Run_iAnnotateSV: All input parameters look good. Lets run the package.")
    myPid = os.getpid()
    today = date.today().isoformat()
    logger.info("Run_iAnnotateSV: ProcessID:%s, Date:%s", myPid, today)
    outputFile = outputDir + "/" + outputPrefix + "_Annotated.txt"
    # Build the argument vector directly so paths with spaces or quotes are
    # passed through untouched (no string splitting involved).
    args = [python, iAnnotateSV,
            "-r", build,
            "-i", inputTabFile,
            "-o", outputDir,
            "-ofp", outputPrefix,
            "-d", str(distance),
            "-c", canonicalTranscriptFile,
            "-rr", repeatregionFile,
            "-cc", cosmicFile,
            "-cct", cosmicCountsFile,
            "-dgv", dgvFile,
            "-v", "-p",
            "-u", uniprotFile]
    cmd = " ".join(args)
    logger.info("Run_iAnnotateSV: Command that will be run: %s", cmd)
    # Remove if the file exists
    if(os.path.isfile(outputFile)):
        os.remove(outputFile)
    proc = Popen(args)
    proc.wait()
    retcode = proc.returncode
    # Any non-zero status (error exit or death by signal) is a failure.
    if(retcode != 0):
        logger.error(
            "Run_iAnnotateSV: iAnnotateSV is either still running on local machine or it errored out with return code %d for %s",
            retcode,
            inputTabFile)
        # Exit with a failing status so calling shells/pipelines notice.
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    logger.info(
        "Run_iAnnotateSV: We have finished running iAnnotateSV for %s using local machine",
        inputTabFile)
    logger.info("Run_iAnnotateSV Duration: %s", totaltime)
    return(outputFile)
예제 #4
0
def _collectDellyVariables(record, caseIDinVcf, controlIDinVcf):
    """Return the per-record values handed to ``GetFilteredRecords``, in its expected order."""
    info = record.INFO
    chrom1 = record.CHROM
    start1 = record.POS
    recordFilter = record.FILTER
    # An empty FILTER list is reported as "PASS"; otherwise only the first
    # filter name is kept.
    if(len(recordFilter) < 1):
        recordFilter = "PASS"
    else:
        recordFilter = recordFilter[0]
    preciseFlag = record.is_sv_precise
    start2 = info.get('END')
    chrom2 = info.get('CHR2')
    svtype = info.get('SVTYPE')
    if("SVLEN" in info):
        svlengthFromDelly = info['SVLEN']
    elif(svtype != "TRA" and start2 is not None):
        # No SVLEN given: derive the length from the two coordinates.
        svlengthFromDelly = abs(start2 - start1)
    else:
        # Translocations get no length; the same applies when END is absent
        # and the length therefore cannot be derived.
        svlengthFromDelly = None
    mapqFromDelly = info.get('MAPQ')
    peSupportFromDelly = info.get('PE')
    srSupportFromDelly = info.get('SR')
    contype = info.get('CT')
    caseData = record.genotype(caseIDinVcf).data
    controlData = record.genotype(controlIDinVcf).data
    # Genotype fields missing from the FORMAT column are reported as None.
    genotypeFields = ("FT", "DR", "DV", "RR", "RV")
    caseValues = [getattr(caseData, field, None) for field in genotypeFields]
    controlValues = [getattr(controlData, field, None) for field in genotypeFields]
    return [chrom1,
            start1,
            start2,
            chrom2,
            recordFilter,
            svlengthFromDelly,
            mapqFromDelly,
            svtype,
            preciseFlag,
            peSupportFromDelly,
            srSupportFromDelly,
            contype] + caseValues + controlValues


def run(
        inputVcf,
        outputDir,
        controlId,
        caseID,
        hotspotFile,
        blacklistFile,
        svlength,
        mapq,
        mapqHotspot,
        caseAltFreqHotspot,
        caseTotalCountHotspot,
        controlAltFreqHotspot,
        caseAltFreq,
        caseTotalCount,
        controlAltFreq,
        peSupport,
        srSupport,
        peSupportHotspot,
        srSupportHotspot,
        peSupportCase,
        srSupportCase,
        peSupportHotspotCase,
        srSupportHotspotCase,
        peSupportControl,
        srSupportControl,
        peSupportHotspotControl,
        srSupportHotspotControl,
        verbose):
    """Filter calls made by Delly which are in a VCF format.

    Every record of ``inputVcf`` is summarised as a comma-joined string and
    passed, together with the comma-joined thresholds, the hotspot dictionary
    and the blacklist, to ``GetFilteredRecords``; records for which it returns
    a true value are written to
    ``<outputDir>/<inputVcf basename>_filtered.vcf``. If ``inputVcf`` does not
    exist the output path is returned without writing anything.

    :param str inputVcf: Input VCF file name with path
    :param str outputDir: Output directory
    :param str controlId: Control Sample ID (Should be part of Sample Name in VCF)
    :param str caseID: Case Sample ID (Should be part of Sample Name in VCF)
    :param str hotspotFile: List of Genes that have Hotspot Structural Variants (Tab-delimited Format without header:chr    start    end    geneName).
    :param str blacklistFile: List of Genes that have blacklist of Structural Variants (Tab-delimited Format without header:chr    start1    chr2     start2; where chr1==chr2, end==start2).
    :param int svlength: length of the structural variants
    :param int mapq: overall mapping quality
    :param int mapqHotspot: mapping quality for hot-spots
    :param float caseAltFreqHotspot: Alternate Allele Frequency threshold for case in hot-spot region
    :param int caseTotalCountHotspot: Total ReadCount threshold for case in hot-spot region
    :param float controlAltFreqHotspot: Alternate Allele Frequency threshold for control in hot-spot region
    :param float caseAltFreq: Alternate Allele Frequency threshold for case
    :param int caseTotalCount: Total ReadCount threshold for case
    :param float controlAltFreq: Alternate Allele Frequency threshold for control
    :param int peSupport: overall pair-end read support threshold for the event
    :param int srSupport: overall split-reads support threshold for the event
    :param int peSupportHotspot: overall pair-end read support threshold for the event in hot-spot region
    :param int srSupportHotspot: overall split-reads support threshold for the event in hot-spot region
    :param int peSupportCase: pair-end read support threshold for the event in the Case sample
    :param int srSupportCase: split-reads support threshold for the event in the Case sample
    :param int peSupportHotspotCase: pair-end read support threshold for the event in hot-spot region for the Case sample
    :param int srSupportHotspotCase: split-reads support threshold for the event in hot-spot region for the Case sample
    :param int peSupportControl: pair-end read support threshold for the event in the Control sample
    :param int srSupportControl: split-reads support threshold for the event in the Control sample
    :param int peSupportHotspotControl: pair-end read support threshold for the event in hot-spot region for the Control sample
    :param int srSupportHotspotControl: split-reads support threshold for the event in hot-spot region for the Control sample
    :param bool verbose: If set, progress messages are logged
    :return: A str name of filtered vcf file
    :rtype: str

    The process is terminated with ``sys.exit(1)`` when no sample name in the
    VCF matches ``caseID`` or when no other sample is left to act as control.
    """
    if(verbose):
        logger.info("FilterDellyCalls: We will now check all the input parameters")
    # Check input parameters
    cp.checkDir(outputDir)
    cp.checkFile(hotspotFile)
    cp.checkFile(blacklistFile)
    cp.checkEmpty(controlId, "Control Bam ID")
    cp.checkEmpty(caseID, "Case Bam ID")
    cp.checkInt(svlength, "Length of Structural Variant Threshold")
    cp.checkInt(mapq, "Mapping quality of Reads threshold")
    cp.checkInt(mapqHotspot, "Mapping quality of Reads threshold for hotspot events ")
    cp.checkInt(peSupport, "overall pair-end read support threshold for the event")
    cp.checkInt(srSupport, "overall split-reads support threshold for the event")
    cp.checkInt(
        peSupportHotspot,
        "overall pair-end read support threshold for the event in hot-spot region")
    cp.checkInt(
        srSupportHotspot,
        "overall split-reads support threshold for the event in hot-spot region")
    cp.checkInt(
        peSupportCase,
        "overall pair-end read support threshold for the event for the Case sample")
    cp.checkInt(
        srSupportCase,
        "overall split-reads support threshold for the event for the Case sample")
    cp.checkInt(
        peSupportHotspotCase,
        "overall pair-end read support threshold for the event in hot-spot region for the Case sample")
    cp.checkInt(
        srSupportHotspotCase,
        "overall split-reads support threshold for the event in hot-spot region for the Case sample")
    cp.checkInt(
        peSupportControl,
        "overall pair-end read support threshold for the event for the Control sample")
    cp.checkInt(
        srSupportControl,
        "overall split-reads support threshold for the event for the Control sample")
    cp.checkInt(
        peSupportHotspotControl,
        "overall pair-end read support threshold for the event in hot-spot region for the Control sample")
    cp.checkInt(
        srSupportHotspotControl,
        "overall split-reads support threshold for the event in hot-spot region for the Control sample")
    if(verbose):
        logger.info("FilterDellyCalls: All Input Parameters look good for filtering these VCF file.")
        logger.info("FilterDellyCalls: We will filter the given VCF file now.")
    # Make a string of all the variables
    thresholdVariablesList = [svlength,
                              mapq,
                              mapqHotspot,
                              caseAltFreqHotspot,
                              caseTotalCountHotspot,
                              controlAltFreqHotspot,
                              caseAltFreq,
                              caseTotalCount,
                              controlAltFreq,
                              peSupport,
                              srSupport,
                              peSupportHotspot,
                              srSupportHotspot,
                              peSupportCase,
                              srSupportCase,
                              peSupportHotspotCase,
                              srSupportHotspotCase,
                              peSupportControl,
                              srSupportControl,
                              peSupportHotspotControl,
                              srSupportHotspotControl]
    thresholdVariables = ",".join(str(v) for v in thresholdVariablesList)

    hotspotDict = chl.ReadHotSpotFile(hotspotFile)
    blacklist = cbl.ReadBlackListFile(blacklistFile)
    outputVcf = os.path.splitext(os.path.basename(inputVcf))[0] + "_filtered.vcf"
    outputFile = os.path.join(outputDir, outputVcf)
    if(not os.path.isfile(inputVcf)):
        if(verbose):
            logger.warning("VCF file %s does not exists.", inputVcf)
        return(outputFile)
    # Context managers guarantee both handles are closed, including on the
    # sys.exit() paths below.
    with open(inputVcf, 'r') as vcfIn, open(outputFile, 'w') as vcfOut:
        vcf_reader = vcf.Reader(vcfIn)
        vcf_writer = vcf.Writer(vcfOut, vcf_reader)
        # NOTE(review): caseID is interpreted as a regular expression, so IDs
        # containing regex metacharacters may match unexpectedly - confirm.
        pattern = re.compile(caseID)
        # Get the case and control id: a sample matching caseID is the case,
        # any other sample is taken as the control (last one wins).
        caseIDinVcf = None
        controlIDinVcf = None
        for sample in vcf_reader.samples:
            if(pattern.search(sample)):
                caseIDinVcf = sample
            else:
                controlIDinVcf = sample
        # Check if ID are assigned properly or not
        if(caseIDinVcf is None):
            logger.error("FilterDellyCalls: caseID was not assigned properly, please make sure that the vcf case id and the provided case id match")
            sys.exit(1)
        if(verbose):
            logger.info("FilterDellyCalls:Case ID is: %s file", caseIDinVcf)
        if(controlIDinVcf is None):
            logger.error("FilterDellyCalls: controlID was not assigned properly, please make sure that the vcf control id and the provided control id match")
            sys.exit(1)
        if(verbose):
            logger.info("FilterDellyCalls:Control ID is: %s file", controlIDinVcf)

        # Traversing the VCF
        for record in vcf_reader:
            # Make a string of all the variables
            dellyVariables = ",".join(
                str(v) for v in _collectDellyVariables(record, caseIDinVcf, controlIDinVcf))
            filterFlag = GetFilteredRecords(dellyVariables, thresholdVariables, hotspotDict, blacklist)
            if(filterFlag):
                vcf_writer.write_record(record)
        vcf_writer.close()
    if(verbose):
        logger.info("FilterDellyCalls: We have finished filtering: %s file", inputVcf)
        logger.info("FilterFellyCalls: Output hass been written in: %s file", outputFile)
    return(outputFile)
예제 #5
0
파일: Run_Delly.py 프로젝트: semir2/iCallSV
def run(delly, version, bcftools, analysisType, reference, controlBam, caseBam,
        caseId, mapq, excludeRegions, outputdir, verbose, debug):
    """
    Run Delly on a case/control BAM pair and return the path of the VCF.

    For Delly versions >= 0.7.3 the ``call`` sub-command is used, calls are
    written to ``<outputdir>/<caseId>_<type>.bcf`` and then converted to VCF
    with ``bcftools view``. Older versions write the VCF directly. Missing
    ``.bai`` index files are created through ``mbi.MakeIndex`` before Delly is
    started.

    :param str delly: Path to delly executables (0.7.3 or above)
    :param version: Delly version, compared against ``StrictVersion('0.7.3')``
    :param str bcftools: Path to bcftools executables (1.3.1 or above)
    :param str analysisType: What to run in delly, DEL:Deletion, DUP:Duplication, TRA:Translocation, INV:Inversion
    :param str reference: Reference Genome that was used to align the reads.
    :param str controlBam: Path to control/normal bam file
    :param str caseBam: Path to case/tumor bam file
    :param str caseId: Id of the case/tumor sample, used as the output file prefix
    :param int mapq: mapping quality cutoff for delly
    :param str excludeRegions: Regions to be excluded for calling structural variation.
    :param str outputdir: directory for the output of delly
    :param bool verbose: If set, progress and failure messages are logged
    :param bool debug: If set, only log the command that would be run; nothing is executed
    :return: str of the output vcf (the file may not exist when Delly produced no BCF)
    :rtype: str

    The process is terminated with ``sys.exit(1)`` when Delly or bcftools
    exits with a non-zero status, or when an old-style Delly run leaves no
    VCF behind. A missing BCF after a successful run is only logged.
    """

    start_time = time.time()
    if (verbose):
        logger.info(
            "Run_Delly: We are now going to run Delly for you. It going to be exciting time."
        )
    myPid = os.getpid()
    today = date.today().isoformat()
    tag = analysisType.lower()
    outputBcf = outputdir + "/" + caseId + "_" + tag + ".bcf"
    outputVcf = outputdir + "/" + caseId + "_" + tag + ".vcf"
    # Check all input variables
    cp.checkFile(controlBam)
    cp.checkFile(caseBam)
    cp.checkFile(delly)
    cp.checkEmpty(version, "Delly Version")
    cp.checkFile(bcftools)
    cp.checkFile(reference)
    cp.checkFile(excludeRegions)
    cp.checkDir(outputdir)
    cp.checkInt(mapq, "Delly MAPQ")
    cp.checkEmpty(caseId, "Delly Case BAM ID")
    cp.checkDellyAnalysisType(analysisType)
    if (verbose):
        logger.info(
            "Run_Delly: All the input parameters look good for running delly")
        logger.info("Run_Delly: ProcessID:%s,Date:%s", myPid, today)

    # Build the argument vector directly instead of splitting a concatenated
    # string, so paths containing spaces or quotes reach delly unchanged.
    writesBcf = version >= StrictVersion('0.7.3')
    dellyArgs = [delly]
    if (writesBcf):
        dellyArgs.append("call")
    dellyArgs.extend([
        "-t", analysisType, "-g", reference, "-x", excludeRegions,
        "-q", str(mapq), "-o", outputBcf if writesBcf else outputVcf,
        caseBam, controlBam
    ])
    cmd = " ".join(dellyArgs)

    if (debug):
        # Dry run: report exactly the command a real run would execute.
        logger.debug("Run_Delly: Command that will be run %s", cmd)
        return (outputVcf)

    # Check if bam index files are there else make them
    for bam in (controlBam, caseBam):
        bai = bam + ".bai"
        if (os.path.isfile(bai)):
            if (verbose):
                logger.info("Run_Delly: Bam Index file is present for %s ",
                            bai)
        else:
            if (verbose):
                logger.warning(
                    "Run_Delly: Bam Index file is not present and we will make it for %s ",
                    bai)
            mbi.MakeIndex(bam)

    if (verbose):
        logger.info("Run_Delly: Command that will be run:%s", cmd)
    proc = Popen(dellyArgs)
    proc.wait()
    retcode = proc.returncode
    # Any non-zero status is a failure: positive values are error exits,
    # negative values mean the child was killed by a signal.
    if (retcode != 0):
        if (verbose):
            logger.fatal(
                "Run_Delly: Delly is either still running on local machine or it errored out with return code %d for %s",
                retcode, caseId)
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    if (verbose):
        logger.info(
            "Run_Delly: We have finished running Delly for %s using local machine",
            caseId)
        logger.info("Run_Delly Duration: %s", totaltime)

    if (not writesBcf):
        # Older Delly writes the VCF itself; it only has to exist.
        if (os.path.isfile(outputVcf)):
            return (outputVcf)
        if (verbose):
            logger.fatal(
                "Run_Delly: Delly is either still running on local machine or it errored out with return code %d for %s",
                retcode, caseId)
        sys.exit(1)

    if (not os.path.isfile(outputBcf)):
        if (verbose):
            logger.fatal(
                "Run_Delly_bcf2vcf: bcf file was not generated the return code is %d for %s",
                retcode, caseId)
        # Deliberately not fatal: the VCF path is still returned even though
        # no BCF was produced, leaving the decision to the caller.
        return (outputVcf)

    # Convert the BCF written by Delly into a plain-text VCF.
    bcftoolsArgs = [bcftools, "view", outputBcf, "-O", "v", "-o", outputVcf]
    if (verbose):
        logger.info("Run_Delly_bcf2vcf: Command that will be run:%s",
                    " ".join(bcftoolsArgs))
    proc = Popen(bcftoolsArgs)
    proc.wait()
    retcode = proc.returncode
    if (retcode != 0):
        if (verbose):
            logger.fatal(
                "Run_Delly_bcf2vcf: bcftools is either still running on local machine or it errored out with return code %d for %s",
                retcode, caseId)
        sys.exit(1)
    totaltime = str(timedelta(seconds=time.time() - start_time))
    if (verbose):
        logger.info(
            "Run_Delly_bcf2vcf: We have finished running bcftools for %s using local machine",
            caseId)
        logger.info("Run_Delly_bcf2vcf Duration: %s", totaltime)
    return (outputVcf)
예제 #6
0
def run(
        python,
        iAnnotateSV,
        build,
        distance,
        canonicalTranscriptFile,
        uniprotFile,
        cosmicFile,
        cosmicCountsFile,
        repeatregionFile,
        dgvFile,
        inputTabFile,
        outputPrefix,
        outputDir):
    """
    Run the iAnnotateSV package on a tab-delimited file and wait for it to finish.

    All inputs are first validated through the ``cp.check*`` helpers. Any
    previous ``<outputDir>/<outputPrefix>_Annotated.txt`` is removed before the
    child process is started, so a stale file from an earlier run can never be
    mistaken for fresh output.

    :param str python: Location for the python executable.
    :param str iAnnotateSV: Location of the wrapper iAnnotateSV package (iAnnotateSV.py)
    :param str build: Which human reference file to be used, hg18,hg19 or hg38
    :param int distance: Distance for extending the promoter region (passed as ``-d``)
    :param str canonicalTranscriptFile: Location of the canonical transcript file (passed as ``-c``)
    :param str uniprotFile: Location for ucsc uniprot file
    :param str cosmicFile: Location for cosmic census file
    :param str cosmicCountsFile: Location for cosmic fusion counts file
    :param str repeatregionFile: Location for repeat region file
    :param str dgvFile: Location for database of Genomic Variants file
    :param str inputTabFile: Tab-Delimited Input File compatible with iAnnotateSV package.
    :param str outputPrefix: Prefix of the output files/DIR with Annotations and images
    :param str outputDir: Name of the output directory where the outputPrefix will be written
    :return: str of the output file
    :rtype: str
    :raises SystemExit: with status 1 when iAnnotateSV exits with a non-zero
        return code or is terminated by a signal.

    """

    start_time = time.time()
    cp.checkDir(outputDir)
    cp.checkFile(iAnnotateSV)
    cp.checkFile(inputTabFile)
    cp.checkFile(python)
    cp.checkInt(distance, "Distance for extending the promoter region")
    cp.checkEmpty(build, "Which human reference file to be used, hg18,hg19 or hg38")
    cp.checkFile(canonicalTranscriptFile)
    cp.checkFile(uniprotFile)
    cp.checkFile(cosmicFile)
    cp.checkFile(cosmicCountsFile)
    cp.checkFile(repeatregionFile)
    cp.checkFile(dgvFile)
    logger.info("Run_iAnnotateSV: All input parameters look good. Lets run the package.")
    myPid = os.getpid()
    today = date.today().isoformat()
    logger.info("Run_iAnnotateSV: ProcessID:%s, Date:%s", myPid, today)
    outputFile = outputDir + "/" + outputPrefix + "_Annotated.txt"
    # Build the argument vector directly instead of concatenating a string and
    # re-splitting it: paths containing whitespace or quotes stay intact.
    args = [
        python, iAnnotateSV,
        "-r", build,
        "-i", inputTabFile,
        "-o", outputDir,
        "-ofp", outputPrefix,
        "-d", str(distance),
        "-c", canonicalTranscriptFile,
        "-rr", repeatregionFile,
        "-cc", cosmicFile,
        "-cct", cosmicCountsFile,
        "-dgv", dgvFile,
        "-v", "-p",
        "-u", uniprotFile,
    ]
    # Human-readable form of the command, used for logging only.
    cmd = " ".join(args)
    logger.info("Run_iAnnotateSV: Command that will be run: %s", cmd)
    # Remove if the file exists
    if os.path.isfile(outputFile):
        os.remove(outputFile)
    proc = Popen(args)
    # wait() blocks until the child has terminated and returns its exit status.
    retcode = proc.wait()
    if retcode != 0:
        # A positive code is an error exit and a negative one means the child
        # was killed by a signal; neither can be treated as success.
        logger.error(
            "Run_iAnnotateSV: iAnnotateSV is either still running on local machine or it errored out with return code %d for %s",
            retcode,
            inputTabFile)
        # Exit with a non-zero status so the calling pipeline sees the failure.
        sys.exit(1)
    end_time = time.time()
    totaltime = str(timedelta(seconds=end_time - start_time))
    logger.info(
        "Run_iAnnotateSV: We have finished running iAnnotateSV for %s using local machine",
        inputTabFile)
    logger.info("Run_iAnnotateSV Duration: %s", totaltime)
    return outputFile
예제 #7
0
def run(
    inputVcf,
    outputDir,
    controlId,
    caseID,
    hotspotFile,
    blacklistFile,
    svlength,
    mapq,
    mapqHotspot,
    caseAltFreqHotspot,
    caseTotalCountHotspot,
    controlAltFreqHotspot,
    caseAltFreq,
    caseTotalCount,
    controlAltFreq,
    peSupport,
    srSupport,
    peSupportHotspot,
    srSupportHotspot,
    peSupportCase,
    srSupportCase,
    peSupportHotspotCase,
    srSupportHotspotCase,
    peSupportControl,
    srSupportControl,
    peSupportHotspotControl,
    srSupportHotspotControl,
    verbose,
):
    """``main:`` Filter calls made by Delly which are in a VCF format.

    Every record of ``inputVcf`` is reduced to a comma-joined string of its
    coordinates, INFO values and per-sample genotype fields, and handed to
    ``GetFilteredRecords`` together with a comma-joined string of all
    thresholds. Records for which that call returns a true value are written
    to ``<outputDir>/<basename of inputVcf>_filtered.vcf``.

    Sample assignment: ``caseID`` is used as a regular expression; a sample
    whose name matches it is the case sample, and any sample that does not
    match is taken as the control sample (the last one seen wins in both
    cases). ``controlId`` is only checked for being non-empty.

    :param str inputVcf: Input VCF file name with path
    :param str outputDir: Output directory
    :param str controlId: Control Sample ID (Should be part of Sample Name in VCF)
    :param str caseID: Case Sample ID (Should be part of Sample Name in VCF)
    :param str hotspotFile: List of Genes that have Hotspot Structural Variants (Tab-delimited Format without header:chr    start    end    geneName).
    :param str blacklistFile: List of Genes that have blacklist of Structural Variants (Tab-delimited Format without header:chr    start1    chr2     start2; where chr1==chr2, end==start2).
    :param int svlength: length of the structural variants
    :param int mapq: overall mapping quality
    :param int mapqHotspot: mapping quality for hot-spots
    :param caseAltFreqHotspot: hot-spot counterpart of ``caseAltFreq``; not validated here, only forwarded to ``GetFilteredRecords``
    :param caseTotalCountHotspot: hot-spot counterpart of ``caseTotalCount``; not validated here, only forwarded to ``GetFilteredRecords``
    :param controlAltFreqHotspot: hot-spot counterpart of ``controlAltFreq``; not validated here, only forwarded to ``GetFilteredRecords``
    :param float caseAltFreq: Alternate Allele Frequency threshold for case
    :param int caseTotalCount: Total ReadCount threshold for case
    :param float controlAltFreq: Alternate Allele Frequency threshold for control
    :param int peSupport: overall pair-end read support threshold for the event
    :param int srSupport: overall split-reads support threshold for the event
    :param int peSupportHotspot: overall pair-end read support threshold for the event in hot-spot region
    :param int srSupportHotspot: overall split-reads support threshold for the event in hot-spot region
    :param int peSupportCase: pair-end read support threshold for the event in the Case sample
    :param int srSupportCase: split-reads support threshold for the event in the Case sample
    :param int peSupportHotspotCase: pair-end read support threshold for the event in hot-spot region for the Case sample
    :param int srSupportHotspotCase: split-reads support threshold for the event in hot-spot region for the Case sample
    :param int peSupportControl: pair-end read support threshold for the event in the Control sample
    :param int srSupportControl: split-reads support threshold for the event in the Control sample
    :param int peSupportHotspotControl: pair-end read support threshold for the event in hot-spot region for the Control sample
    :param int srSupportHotspotControl: split-reads support threshold for the event in hot-spot region for the Control sample
    :param bool verbose: When true, progress messages are logged at info level
    :return: A str name of filtered vcf file
    :rtype: str
    :raises SystemExit: with status 1 when no case sample or no control sample
        could be identified in the VCF header.
    """
    if verbose:
        logger.info("FilterDellyCalls: We will now check all the input parameters")
    # Check input parameters
    cp.checkDir(outputDir)
    cp.checkFile(inputVcf)
    cp.checkFile(hotspotFile)
    cp.checkFile(blacklistFile)
    cp.checkEmpty(controlId, "Control Bam ID")
    cp.checkEmpty(caseID, "Case Bam ID")
    cp.checkInt(svlength, "Length of Structural Variant Threshold")
    cp.checkInt(mapq, "Mapping quality of Reads threshold")
    cp.checkInt(mapqHotspot, "Mapping quality of Reads threshold for hotspot events ")
    cp.checkInt(peSupport, "overall pair-end read support threshold for the event")
    cp.checkInt(srSupport, "overall split-reads support threshold for the event")
    cp.checkInt(peSupportHotspot, "overall pair-end read support threshold for the event in hot-spot region")
    cp.checkInt(srSupportHotspot, "overall split-reads support threshold for the event in hot-spot region")
    cp.checkInt(peSupportCase, "overall pair-end read support threshold for the event for the Case sample")
    cp.checkInt(srSupportCase, "overall split-reads support threshold for the event for the Case sample")
    cp.checkInt(
        peSupportHotspotCase,
        "overall pair-end read support threshold for the event in hot-spot region for the Case sample",
    )
    cp.checkInt(
        srSupportHotspotCase,
        "overall split-reads support threshold for the event in hot-spot region for the Case sample",
    )
    cp.checkInt(peSupportControl, "overall pair-end read support threshold for the event for the Control sample")
    cp.checkInt(srSupportControl, "overall split-reads support threshold for the event for the Control sample")
    cp.checkInt(
        peSupportHotspotControl,
        "overall pair-end read support threshold for the event in hot-spot region for the Control sample",
    )
    cp.checkInt(
        srSupportHotspotControl,
        "overall split-reads support threshold for the event in hot-spot region for the Control sample",
    )
    if verbose:
        logger.info("FilterDellyCalls: All Input Parameters look good for filtering these VCF file.")
        logger.info("FilterDellyCalls: We will filter the given VCF file now.")
    # Make a string of all the variables; the order is part of the contract
    # with GetFilteredRecords and must not be changed.
    thresholdVariablesList = [
        svlength,
        mapq,
        mapqHotspot,
        caseAltFreqHotspot,
        caseTotalCountHotspot,
        controlAltFreqHotspot,
        caseAltFreq,
        caseTotalCount,
        controlAltFreq,
        peSupport,
        srSupport,
        peSupportHotspot,
        srSupportHotspot,
        peSupportCase,
        srSupportCase,
        peSupportHotspotCase,
        srSupportHotspotCase,
        peSupportControl,
        srSupportControl,
        peSupportHotspotControl,
        srSupportHotspotControl,
    ]
    thresholdVariables = ",".join(str(v) for v in thresholdVariablesList)

    hotspotDict = chl.ReadHotSpotFile(hotspotFile)
    blacklist = cbl.ReadBlackListFile(blacklistFile)
    outputVcf = os.path.splitext(os.path.basename(inputVcf))[0] + "_filtered.vcf"
    outputFile = os.path.join(outputDir, outputVcf)
    pattern = re.compile(caseID)
    # Per-sample genotype fields, in the order they appear in the joined string.
    genotypeFields = ("FT", "DR", "DV", "RR", "RV")

    # The context manager guarantees the input handle is released, and the
    # try/finally guarantees the writer is closed, on every exit path
    # (normal completion, sys.exit, or an exception while filtering).
    with open(inputVcf, "r") as inputHandle:
        vcf_reader = vcf.Reader(inputHandle)
        vcf_writer = vcf.Writer(open(outputFile, "w"), vcf_reader)
        try:
            # Get the case and control id: a sample matching caseID is the
            # case, every other sample is treated as the control.
            caseIDinVcf = None
            controlIDinVcf = None
            for sample in vcf_reader.samples:
                if pattern.search(sample):
                    caseIDinVcf = sample
                else:
                    controlIDinVcf = sample
            # Check if ID are assigned properly or not
            if caseIDinVcf is None:
                logger.error(
                    "FilterDellyCalls: caseID was not assigned properly, please make sure that the vcf case id and the provided case id match"
                )
                sys.exit(1)
            if verbose:
                logger.info("FilterDellyCalls:Case ID is: %s file", caseIDinVcf)
            if controlIDinVcf is None:
                logger.error(
                    "FilterDellyCalls: controlID was not assigned properly, please make sure that the vcf control id and the provided control id match"
                )
                sys.exit(1)
            if verbose:
                logger.info("FilterDellyCalls:Control ID is: %s file", controlIDinVcf)

            # Traversing the VCF
            for record in vcf_reader:
                info = record.INFO
                chrom1 = record.CHROM
                start1 = record.POS
                # An empty FILTER list is reported as "PASS"; otherwise only
                # the first filter name is kept.
                recordFilters = record.FILTER
                if len(recordFilters) < 1:
                    recordFilter = "PASS"
                else:
                    recordFilter = recordFilters[0]
                preciseFlag = record.is_sv_precise
                # INFO keys that are absent are represented as None.
                start2 = info.get("END")
                chrom2 = info.get("CHR2")
                svtype = info.get("SVTYPE")
                if "SVLEN" in info:
                    svlengthFromDelly = info["SVLEN"]
                elif svtype == "TRA":
                    # No length is derived for translocations.
                    svlengthFromDelly = None
                else:
                    # NOTE(review): raises TypeError when END is missing from
                    # INFO (start2 is None); kept as in the original.
                    svlengthFromDelly = abs(start2 - start1)
                mapqFromDelly = info.get("MAPQ")
                peSupportFromDelly = info.get("PE")
                srSupportFromDelly = info.get("SR")
                contype = info.get("CT")

                caseData = record.genotype(caseIDinVcf).data
                controlData = record.genotype(controlIDinVcf).data
                # Missing genotype fields are represented as None.
                caseValues = [getattr(caseData, name, None) for name in genotypeFields]
                controlValues = [getattr(controlData, name, None) for name in genotypeFields]

                # Make a string of all the variables; as with the thresholds,
                # the order is what GetFilteredRecords receives.
                dellyVariablesList = [
                    chrom1,
                    start1,
                    start2,
                    chrom2,
                    recordFilter,
                    svlengthFromDelly,
                    mapqFromDelly,
                    svtype,
                    preciseFlag,
                    peSupportFromDelly,
                    srSupportFromDelly,
                    contype,
                ] + caseValues + controlValues
                dellyVariables = ",".join(str(v) for v in dellyVariablesList)
                filterFlag = GetFilteredRecords(dellyVariables, thresholdVariables, hotspotDict, blacklist)
                if filterFlag:
                    vcf_writer.write_record(record)
        finally:
            vcf_writer.close()
    if verbose:
        logger.info("FilterDellyCalls: We have finished filtering: %s file", inputVcf)
        logger.info("FilterFellyCalls: Output hass been written in: %s file", outputFile)
    return outputFile