Example #1
def get_rna_genes(anRnaGeneFile, anRnaGeneFamilyFile, anIsDebug):
    '''
    ' This function parses the RNA gene and RNA gene family blacklist files.
    '
    ' anRnaGeneFile:  An RNA gene file
    ' anRnaGeneFamilyFile:  An RNA gene family file
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # open the files
    geneFileHandler = radiaUtil.get_read_fileHandler(anRnaGeneFile)
    geneFamilyFileHandler = radiaUtil.get_read_fileHandler(anRnaGeneFamilyFile)
    rnaGeneList = list()
    rnaGeneFamilyList = list()

    for line in geneFileHandler:

        # we can ignore the lines that start with # for now
        if (line.startswith("#") or line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("RNA Blacklist: %s", line)

        rnaGeneList.append(line)

    for line in geneFamilyFileHandler:

        # we can ignore the lines that start with # for now
        if (line.startswith("#") or line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("RNA Blacklist: %s", line)

        rnaGeneFamilyList.append(line)

    geneFileHandler.close()
    geneFamilyFileHandler.close()

    return rnaGeneList, rnaGeneFamilyList
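
A minimal usage sketch for this function; the blacklist file names and gene names are assumptions for illustration only:

# hypothetical usage: load the blacklists and check gene names against them
(rnaGeneList, rnaGeneFamilyList) = get_rna_genes("rnaGeneBlacklist.tab",
                                                 "rnaGeneFamilyBlacklist.tab",
                                                 False)
for gene in ("EGFR", "RP11-34P13.7"):
    # downstream filters treat the blacklist entries as substrings
    if any(rnaGene in gene for rnaGene in rnaGeneList):
        print(gene + " is on the RNA gene blacklist")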
Example #2
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It groups all of the hits from one query
    ' sequence into a nested dict keyed by the coordinate id, the prefix,
    ' and the read id, and returns that dict.  It ignores empty lines and
    ' strips trailing \r\n characters.
    '
    ' aBlatFile:  An output file from BLAT
    ' anOutputFormat:  BLAST or PSL
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(dict)

    for line in fileHandler:

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("BLAT: %s", line)

        # split the line on the tab
        splitLine = line.split("\t")

        # get the coordinate data = rnaTumor_7_55196749_HS2144:2:1108:17342:164248
        if (anOutputFormat == "PSL"):
            blatId = splitLine[9]
        elif (anOutputFormat == "BLAST"):
            blatId = splitLine[0]

        blatSplitId = blatId.split("_")
        prefix = blatSplitId[0]
        coordinateId = "_".join(blatSplitId[1:3])
        readId = "_".join(blatSplitId[0:4])

        if coordinateId not in blatHitsDict:
            blatHitsDict[coordinateId] = collections.defaultdict(dict)
        if prefix not in blatHitsDict[coordinateId]:
            blatHitsDict[coordinateId][prefix] = collections.defaultdict(list)

        blatHitsDict[coordinateId][prefix][readId].append(line)

    fileHandler.close()
    return blatHitsDict
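
The returned dict is nested three levels deep; a short sketch of how a caller might walk it (the file name is an assumption for illustration):

# blatHitsDict[coordinateId][prefix][readId] is a list of raw BLAT lines
blatHitsDict = parse_blat_output("blatOutput.psl", "PSL", False)
for coordinateId, prefixDict in blatHitsDict.items():
    for prefix, readDict in prefixDict.items():
        for readId, blatLines in readDict.items():
            print(coordinateId + "\t" + prefix + "\t" + str(len(blatLines)))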
Example #3
def get_vcf_data(aVCFFile, anIsDebug):
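    '''
    ' This function parses a VCF file and collects the header lines, the
    ' ##INFO lines, the ##FILTER lines, the #CHROM line, and a dict of the
    ' data lines keyed by the stop coordinate.
    '
    ' aVCFFile:  A VCF file
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''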

    headerList = list()
    chromLine = None
    infoList = list()
    filterList = list()
    coordinateDict = dict()

    vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFile)

    for line in vcfFileHandler:

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("vcfLine: %s", line)

        # if we find the FILTER section, then record the filters
        if (line.startswith("##FILTER")):
            filterList.append(line)

        # if we find the INFO section, then record the info
        elif (line.startswith("##INFO")):
            infoList.append(line)

        # if we find the header line section
        elif (line.startswith("#CHROM")):
            chromLine = line

        # if we find the header line section
        elif (line.startswith("#")):
            headerList.append(line)

        # now we are to the data
        else:
            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            # chrom = splitLine[0]
            stopCoordinate = splitLine[1]
            coordinateDict[stopCoordinate] = line + "\n"

    vcfFileHandler.close()

    return (headerList, chromLine, infoList, filterList, coordinateDict)
Example #4
    def load_from_file(self, fname, ci=0, sti=1, spi=2, vi=3):
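        '''
        ' This function loads intervals from a tab-delimited file and passes
        ' each (chrom, start, stop, value) tuple to self.load_bins.  By
        ' default the chrom, start, and stop come from columns 0, 1, and 2,
        ' and an optional value comes from column 3.
        '
        ' fname:  A tab-delimited file of intervals
        ' ci, sti, spi, vi:  The column indices for the chrom, start, stop,
        '                    and value fields
        '''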

        inFile = radiaUtil.get_read_fileHandler(fname)

        for line in inFile:
            data = line[:-1].split('\t')

            c = data[ci]
            st = int(data[sti])
            sp = int(data[spi])

            if len(data) < 4:
                v = ''
            else:
                v = data[vi]

            self.load_bins((c, st, sp, v))

        inFile.close()
Example #5
def get_vcf_data(aVcfFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a VCF input file and uses the python generator
    ' to yield the information one line at a time.  It ignores empty lines and
    ' strips trailing \r\n characters.  This function yields all the
    ' information from the VCF file.
    '
    ' aVcfFile:         A VCF file
    ' aPassOnlyFlag:    If all calls should be processed or only those calls
    '                   that passed the filters thus far
    ' anIsDebug:         A flag for outputting debug messages to STDERR
    '''

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    for line in fileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("VCF: %s", line)

        # if we are only supposed to process the passed calls
        # and this call has not passed, then skip it
        if (aPassOnlyFlag and "PASS" not in line):
            continue

        yield line

    fileHandler.close()
    return
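
A minimal usage sketch of the generator; the VCF path is an assumption for illustration:

# hypothetical usage: iterate over only the passing calls, one line at a time
for vcfLine in get_vcf_data("patient_chr9.vcf.gz", True, False):
    splitLine = vcfLine.split("\t")
    print(splitLine[0] + ":" + splitLine[1])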
Example #6
def merge_vcf_data(aDnaFile, anRnaFile, anOverlapsFile,
                   aNonOverlapsFile, anIsDebug):
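    '''
    ' This function merges the DNA and RNA calls.  The DNA mpileup filtered
    ' calls are loaded first, then the calls from the overlaps file, the
    ' non-overlaps file, and the RNA mpileup filtered file are used to
    ' overwrite them or to merge the filters.
    '
    ' aDnaFile:          The results from the DNA mpileup filter
    ' anRnaFile:         The results from the RNA mpileup filter
    ' anOverlapsFile:    The calls that pass in both the DNA and RNA
    ' aNonOverlapsFile:  The calls that pass in the RNA but not the DNA
    ' anIsDebug:         A flag for outputting debug messages to STDERR
    '''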

    # open the input files
    dnaFileHandler = radiaUtil.get_read_fileHandler(aDnaFile)
    rnaFileHandler = radiaUtil.get_read_fileHandler(anRnaFile)
    overlapsFileHandler = radiaUtil.get_read_fileHandler(anOverlapsFile)
    if (os.path.isfile(aNonOverlapsFile)):
        nonOverlapsFileHandler = radiaUtil.get_read_fileHandler(
                                                            aNonOverlapsFile)

    headerList = list()
    coordinateDict = dict()

    # the dna file has the results from the dna mpileup filter
    # the rna file has the results from the rna mpileup filter
    # the overlaps file has calls that pass in both the DNA and RNA
    # the non-overlaps file originally has calls that don't pass in the DNA
    # but pass in the RNA. these RNA calls are further filtered to eliminate
    # possible germline (dnm*/DB/GERM) calls or false positives due to
    # pseudogenes (EGPS/RTPS), and then the RNA reads are optionally run
    # through the blat filter to check for mapping uniqueness - these are
    # the RNA Rescue and RNA Editing calls

    # process all of the calls from the DNA mpileup filter
    for line in dnaFileHandler:

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("DNA mpileup Line: %s", line)

        # if it is a header line, then add it to the header list
        if (line.startswith("#")):

            # keep all the header lines
            headerList.append(line + "\n")

        # now we are to the data
        else:
            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            stopCoordinate = splitLine[1]
            coordinateDict[stopCoordinate] = line + "\n"

    # these are all the calls that pass in both the DNA and RNA
    for line in overlapsFileHandler:

        # if it is an empty line, then continue
        # if it is a header line, then continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("Overlaps file Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # the coordinate is the second element
        stopCoordinate = splitLine[1]

        # if the call passed in both the RNA and DNA,
        # then adjust the origin
        if (stopCoordinate in coordinateDict):
            dnaLine = coordinateDict[stopCoordinate]
            if (anIsDebug):
                logging.debug("passed in both RNA and DNA (from overlaps " +
                              "file) changing the origin to (DNA,RNA) " +
                              "\nDNALine: %s RNALine: %s\n",
                              dnaLine, line)
            dnaLine = dnaLine.replace("ORIGIN=DNA", "ORIGIN=DNA,RNA")
            coordinateDict[stopCoordinate] = dnaLine
        else:
            coordinateDict[stopCoordinate] = line + "\n"

    # loop through the RNA mpileup filtered calls
    # create 2 dictionaries:  one for passing, one for non-passing
    #
    # if an RNA Rescue or RNA Editing call passes in the non-overlaps file
    # below, then we want to use the original RNA mpileup passing call to
    # overwrite the DNA call. the non-overlaps file is really the RNA mpileup
    # passing calls that are first filtered by DNA, then grep, and then blat.
    # the filtered by DNA part doesn't select one modType when no call passes,
    # so the final passing call has more than one modType which causes problems
    # in the next filter, therefore use the RNA mpileup passing call.
    #
    # mpileup_rna_origin:
    # 9       17464495        .       G       A       0.0     PASS
    #    AC=5;AF=0.1;AN=2;BQ=39;DP=49;FA=0.1;INS=0;DEL=0;;MC=G>A;
    #    MT=TUM_EDIT;NS=3;ORIGIN=RNA;SB=0.73;SS=5;START=0;STOP=5;VT=SNP
    #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
    #    0/0:22:22,0:1.0,0.0:0:0:0:1:32,0:0.68,0.0
    #    0/0:10:9,1:0.9,0.1:0:0:0:0:29,13:0.67,1.0
    #    0/1:17:13,4:0.76,0.24:0:0:0:4:60,32:0.92,0.5
    # vs.
    # mpileup_rna_origin->dnaFiltered->blat:
    # 9       17464495        .       G       A       0.0     PASS
    #    AC=5;AF=0.1;AN=2;BQ=39;DP=49;FA=0.1;INS=0;DEL=0;;MC=G>A,G>A;
    #    MF=rnacall,dtmnab_dtmnbq;MFT=DNA_TUM_EDIT_G>A,DNA_SOM_G>A;
    #    MT=TUM_EDIT,SOM;NS=3;ORIGIN=RNA;SB=0.73;SS=5;START=0;STOP=5;VT=SNP
    #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
    #    0/0:22:22,0:1.0,0.0:0:0:0:1:32,0:0.68,0.0
    #    0/0:10:9,1:0.9,0.1:0:0:0:0:29,13:0.67,1.0
    #    0/1:17:13,4:0.76,0.24:0:0:0:4:60,32:0.92,0.5
    #
    # when merging a call that passes in the non-overlaps (dnaFiltered or blat)
    # file, replace the DNA call, with the original RNA mpileup passing call
    #
    # the non-passing dictionary will be used below to help merge filtered
    # calls when a call gets filtered by both the RNA and DNA

    rnaMpileupPassingDict = {}
    rnaMpileupNonpassingDict = {}
    for rnaLine in rnaFileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        if (rnaLine.isspace() or rnaLine.startswith("#")):
            continue

        # strip the carriage return and newline characters
        rnaLine = rnaLine.rstrip("\r\n")

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup Line: %s", rnaLine)

        # now we are to the data
        # split the line on the tab
        rnaLineSplit = rnaLine.split("\t")

        # the coordinate is the second element
        stopCoordinate = rnaLineSplit[1]

        # put the call in the right dict
        if "PASS" in rnaLineSplit[6]:
            rnaMpileupPassingDict[stopCoordinate] = rnaLine
        else:
            rnaMpileupNonpassingDict[stopCoordinate] = rnaLine

    # these are the RNA Rescue and RNA Editing calls after
    # the initial filtering but before filterByReadSupport.py
    if (os.path.isfile(aNonOverlapsFile)):
        for line in nonOverlapsFileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug and not line.startswith("#")):
                logging.debug("Non-overlaps Line: %s", line)

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            stopCoordinate = splitLine[1]

            # if this call passed in the RNA, then overwrite
            # the DNA call that didn't pass
            if ("PASS" in splitLine[6]):
                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]
                    # get the RNA line from the RNA mpileups passing dict
                    rnaLine = rnaMpileupPassingDict[stopCoordinate]
                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("Overwriting non-passing DNA call " +
                                          "with passing RNA Rescue calls " +
                                          "\nDNALine: %s" +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)
                        coordinateDict[stopCoordinate] = rnaLine + "\n"
                    else:
                        if (anIsDebug):
                            # this call passed in both
                            logging.debug("Unusual call in non-overlaps " +
                                          "file passed in both the RNA and " +
                                          "DNA but they probably don't have " +
                                          "the same modType! \nDNALine: %s " +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)
                        # at this point, there are multiple events that pass
                        # all the filters. in this case, pick the passing
                        # event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                        else:
                            coordinateDict[stopCoordinate] = rnaLine
                # this call didn't exist in the DNA
                else:
                    logging.warning("Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"
            # this call didn't pass in the RNA
            else:
                if (anIsDebug):
                    logging.debug("Call didn't pass in RNA: " +
                                  "RNALine: %s\n", line)

                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]
                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("RNANoPass:  Didn't pass in both, " +
                                          "so change origin and merge " +
                                          "filters \nDNALine: %s " +
                                          "RNALine: %s\n", dnaLine, line)
                        # change origin
                        if ("ORIGIN=DNA,RNA" not in dnaLine):
                            dnaLine = dnaLine.replace("ORIGIN=DNA",
                                                      "ORIGIN=DNA,RNA")
                        dnaLine = dnaLine.rstrip("\r\n")
                        dnaLineSplit = dnaLine.split("\t")

                        # merge the filters for the FILTER column
                        dnaLineSplit[6] = merge_filters(splitLine[6],
                                                        dnaLineSplit[6])

                        # merge the mod filters and filter types
                        # in the INFO column
                        dnaLineSplit[7] = merge_mod_filters(
                                                        splitLine[7],
                                                        dnaLineSplit[7])

                        newDnaLine = "\t".join(dnaLineSplit) + "\n"
                        coordinateDict[stopCoordinate] = newDnaLine

                        if (anIsDebug):
                            logging.debug("RNANoPass:  After change origin " +
                                          "and merge filters " +
                                          "\nFinalLine: %s\n",
                                          "\t".join(dnaLineSplit))
                    else:
                        # this call passed in both:
                        # DNALine: 17 4857042 .   T   A,G,C   0.0 PASS
                        #    AB=A,G,C;AC=10,5,8211;AF=0.0,0.0,0.98;AN=4;
                        #    BQ=31;DP=8379;FA=0.98;INS=0;DEL=0;;MC=T>A;
                        #    MT=GERM;NS=3;ORG_ISO_AD=16_2_1_2615,
                        #    18_3_1_2791,18_1_2_2805;ORIGIN=DNA;
                        #    RS_GEN_POS=17:4854383-4860426,
                        #    17:4854383-4860426,17:4854383-4860426;
                        #    RS_NAME=NM_001193503,NM_001976,NM_053013;
                        #    RS_ORG_POS=313,484,442;RS_STRAND=+,+,+;
                        #    SB=0.74;SS=1;START=1;STOP=0;VT=SNP
                        #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
                        #    0/1:36:31,4,1,0:0.86,0.11,0.03,0.0:0:0:1:0:
                        #        29,28,3,0:0.39,0.5,1.0,0.0
                        #    0/0:70:70,0,0,0:1.0,0.0,0.0,0.0:0:0:0:0:
                        #        31,0,0,0:0.56,0.0,0.0,0.0
                        #    3/3:8273:52,6,4,8211:0.01,0.0,0.0,0.99:0:0:0:0:
                        #        45,12,12,58:0.94,1.0,1.0,0.97
                        # RNALine: 17    4857042 .   T   A,G,C   0.0 PASS
                        #    AB=A,G,C;AC=10,5,8211;AF=0.0,0.0,0.98;AN=4;
                        #    BQ=31;DP=8379;FA=0.98;INS=0;DEL=0;;MC=T>C;
                        #    MT=TUM_EDIT;NS=3;ORG_ISO_AD=16_2_1_2615,
                        #    18_3_1_2791,18_1_2_2805;ORIGIN=RNA;
                        #    RS_GEN_POS=17:4854383-4860426,
                        #    17:4854383-4860426,17:4854383-4860426;
                        #    RS_NAME=NM_001193503,NM_001976,NM_053013;
                        #    RS_ORG_POS=313,484,442;RS_STRAND=+,+,+;
                        #    SB=0.74;SS=5;START=1;STOP=0;VT=SNP
                        #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
                        #    0/1:36:31,4,1,0:0.86,0.11,0.03,0.0:0:0:1:0:
                        #        29,28,3,0:0.39,0.5,1.0,0.0
                        #    0/0:70:70,0,0,0:1.0,0.0,0.0,0.0:0:0:0:0:
                        #        31,0,0,0:0.56,0.0,0.0,0.0
                        #    3/3:8273:52,6,4,8211:0.01,0.0,0.0,0.99:0:0:0:0:
                        #        45,12,12,58:0.94,1.0,1.0,0.97
                        logging.warning("RNANoPass:  Call passed in both " +
                                        "RNA and DNA but they probably " +
                                        "don't have the same modType " +
                                        "\nDNALine: %s RNALine: %s\n",
                                        dnaLine, line)
                        # at this point, there are multiple events that
                        # pass all the filters. in this case, pick the
                        # passing event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                        else:
                            coordinateDict[stopCoordinate] = line
                # this call didn't exist in the DNA
                else:
                    logging.warning("RNANoPass:  Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"

    # these are needed for merging the RNA mpileup filters
    for (rnaStopCoordinate, rnaLine) in rnaMpileupNonpassingDict.iteritems():

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup non-passing Line: %s", rnaLine)

        # if this call existed in the DNA and
        # the user wants the merged calls
        if (rnaStopCoordinate in coordinateDict):

            # split the line on the tab
            rnaLineSplit = rnaLine.split("\t")

            # get the original line
            dnaLine = coordinateDict[rnaStopCoordinate]
            dnaLine = dnaLine.rstrip("\r\n")
            dnaLineSplit = dnaLine.split("\t")

            # if the call didn't pass in the RNA or DNA,
            # we want to merge the filters
            if "PASS" not in dnaLineSplit[6]:
                if (anIsDebug):
                    logging.debug("Merging filters for \nDNALine: %s " +
                                  "\nRNALine: %s", dnaLine, rnaLine)

                # merge the filters for the FILTER column
                dnaLineSplit[6] = merge_filters(rnaLineSplit[6],
                                                dnaLineSplit[6])

                # merge the mod filters and filter types in the INFO column
                dnaLineSplit[7] = merge_mod_filters(rnaLineSplit[7],
                                                    dnaLineSplit[7])

                finalLine = "\t".join(dnaLineSplit)
                if ("ORIGIN=DNA,RNA" not in finalLine):
                    finalLine = finalLine.replace("ORIGIN=DNA",
                                                  "ORIGIN=DNA,RNA")

                coordinateDict[rnaStopCoordinate] = finalLine + "\n"
                if (anIsDebug):
                    logging.debug("Merged filters \nFinalLine: %s", finalLine)
        # this call didn't exist in the DNA
        else:
            coordinateDict[rnaStopCoordinate] = rnaLine + "\n"

    dnaFileHandler.close()
    rnaFileHandler.close()
    overlapsFileHandler.close()
    if (os.path.isfile(aNonOverlapsFile)):
        nonOverlapsFileHandler.close()

    return (headerList, coordinateDict)
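
merge_filters and merge_mod_filters are called above but not shown in these examples. A minimal sketch of what a FILTER-merging helper could look like, assuming the FILTER column is a semicolon-separated list and PASS should be dropped once any real filter is present; this is purely illustrative and not the RADIA implementation:

def merge_filters_sketch(anRnaFilterString, aDnaFilterString):
    # union the semicolon-separated filter ids from both calls
    mergedFilters = set(anRnaFilterString.split(";"))
    mergedFilters.update(aDnaFilterString.split(";"))
    # drop PASS if any real filter is present
    if (len(mergedFilters) > 1):
        mergedFilters.discard("PASS")
    return ";".join(sorted(mergedFilters))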
Example #7
def get_vcf_data(anId, anInputDir, anIsDebug):
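    '''
    ' This function loads all of the per-chromosome VCF files in anInputDir
    ' that start with anId.  It collects the header lines from the first
    ' file and keeps the data lines per chromosome, separating numeric and
    ' non-numeric chromosome names so they can be sorted later.
    '
    ' anId:        The id that prefixes the per-chromosome VCF files
    ' anInputDir:  The directory containing the VCF files
    ' anIsDebug:   A flag for outputting debug messages to STDERR
    '''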

    # for each file that starts with this id
        # load the first file to get the header
        # get the coordinates for all

    processedHeader = False
    headerDict = dict()
    headerDict["metadata"] = list()
    headerDict["format"] = list()
    headerDict["info"] = list()
    headerDict["filter"] = list()
    headerDict["chrom"] = list()
    coordinateDict = dict()
    coordinateDict["numbers"] = dict()
    coordinateDict["letters"] = dict()

    # if the input directory doesn't end with a forward slash,
    # then add one so that glob.glob will work
    if (not anInputDir.endswith("/")):
        anInputDir = anInputDir + "/"

    # for each vcf file
    # they might be gzipped, they might not
    for vcfFile in (glob.glob(anInputDir + anId + "_chr*.vcf*")):

        # open the file
        vcfFileHandler = radiaUtil.get_read_fileHandler(vcfFile)

        for line in vcfFileHandler:

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # if we haven't processed the header yet, then do it here
            if (not processedHeader):
                # extract the metadata
                if (line.startswith("##FORMAT")):
                    headerDict["format"].append(line)
                elif (line.startswith("##INFO")):
                    headerDict["info"].append(line)
                elif (line.startswith("##FILTER")):
                    headerDict["filter"].append(line)
                elif (line.startswith("##")):
                    headerDict["metadata"].append(line)
                elif (line.startswith("#CHROM")):
                    headerDict["chrom"].append(line)
                    # now we've processed the header
                    processedHeader = True

            if (line.startswith("#")):
                continue
            else:
                # split the line on the tab
                splitLine = line.split("\t")

                # the chrom is the first element
                chrom = splitLine[0]

                # we want to sort everything at the end, so keep track
                # of the chroms that are numbers and letters separately
                if (is_number(chrom)):
                    if chrom not in coordinateDict["numbers"]:
                        coordinateDict["numbers"][chrom] = list()
                    coordinateDict["numbers"][chrom].append(line)
                else:
                    if chrom not in coordinateDict["letters"]:
                        coordinateDict["letters"][chrom] = list()
                    coordinateDict["letters"][chrom].append(line)

        # close the file and move onto the next one
        vcfFileHandler.close()

    return (headerDict, coordinateDict)
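
is_number is referenced above but not shown here. A minimal sketch of such a helper, assuming it only needs to distinguish numeric chromosome names (e.g. "1" through "22") from names like "X", "Y", and "MT":

def is_number(aChrom):
    # return True for numeric chromosome names such as "1" or "22"
    try:
        float(aChrom)
        return True
    except ValueError:
        return False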
Example #8
def get_validation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' The validation files are tab-delimited and have at least 7 fields:
    ' chrom, chr_start, chr_stop, ref, var, source, and val_result.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for line in inputFileHandler:

        # if it is an empty line or header line, then just continue
        if (line.isspace() or
            line.startswith("#") or
            line.startswith("chrom")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("Validation Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # get the fields to yield
        # columnHeaders = ["chrom", "chr_start", "chr_stop",
        #                  "ref", "var", "source", "val_result"]
        # these are 0-based
        chrom = splitLine[0]
        # startCoordinate = splitLine[1]
        stopCoordinate = splitLine[2]
        # ref = splitLine[3]
        # variantAllele = splitLine[4]
        # center = splitLine[5]
        # valResult = splitLine[6]

        # add the coordinate to the output
        outputDict[chrom + "_" + stopCoordinate] = line

        # keep track of the number of total events per file
        aStatsDict[aPrefix + "_events"] += 1

        # all events are considered passing events
        aStatsDict[aPrefix + "_pass_events"] += 1

        # keep track of the total number of comparison events (blck, dnSnp,
        # etc.) per file. there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix == "cmp"):
                # break up the string to get the individual keys
                cmpKeyList = cmpKeyString.split(",")
                # search for each one of them
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        if (cmpKey in line):
                            statKey = aPrefix + "_pass_" + cmpKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        break

    inputFileHandler.close()

    return (outputDict, aStatsDict)
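
A sketch of the kind of compare dict and stats dict these comparison functions expect, based on the filter names mentioned in these examples; the specific keys, the file name, and the use of a defaultdict(int) for the stats are assumptions for illustration:

import collections

# hypothetical mapping from RADIA filter keys to the compare file's keys
compareDict = {"SOM": "Somatic", "GERM": "Germline"}
statsDict = collections.defaultdict(int)
(valDict, statsDict) = get_validation_data("validation.tab", statsDict,
                                           compareDict, "cmp", False)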
Example #9
def get_vcf_data(anInputFilename, aStatsDict, aCompareDict,
                 aPrefix, anIsDebug):
    '''
    ' The .vcf files must have at least 10 fields:  chromosome, coordinate, id
    ' references, alts, quality score, filters, infos, format, and summary info
    ' for at least one .bam file.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for line in inputFileHandler:

        # if it is an empty line or header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("VCF Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # get the fields to yield
        # columnHeaders = ["CHROM", "POS", "ID", "REF", "ALT",
        #                  "QUAL", "FILTER", "INFO", "FORMAT"]
        chrom = splitLine[0]
        stopCoordinate = splitLine[1]

        # outputDict[chrom + "_" + stopCoordinate] = line

        # keep track of the number of total events per file
        # aStatsDict[aPrefix + "_events"] += 1

        # if ("PASS" in line):
        #    aStatsDict[aPrefix + "_pass_events"] += 1

        # the set being compared against has to be the smaller/limited
        # one, i.e. only the passing somatic events, otherwise
        # everything will be found
        # if (aPrefix == "rad" and "SNP" in line):
        # if (aPrefix == "rad"):
        if (aPrefix == "rad" and "PASS" in line and "SNP" in line and
            ("SOM" in line or "EDIT" in line or
             "RNA_TUM_VAR" in line or "RNA_NOR_VAR" in line)):
            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line
        # elif (aPrefix == "cmp" and "PASS" in line):
        # elif (aPrefix == "cmp" and "SNP" in line):
        elif (aPrefix == "cmp" and "PASS" in line and "SNP" in line and
              ("SOM" in line or "EDIT" in line or
               "RNA_TUM_VAR" in line or "RNA_NOR_VAR" in line)):
            outputDict[chrom + "_" + stopCoordinate] = line

        # if ("PASS" in line and "Somatic" in line and "SNP" in line):
        # if ("SOM" in line):
        #    outputDict[chrom + "_" + stopCoordinate] = line

        #    # keep track of the number of total events per file
        #    aStatsDict[aPrefix + "_events"] += 1

        #    if ("PASS" in line):
        #        aStatsDict[aPrefix + "_pass_events"] += 1

        # if ("PASS" in line and "SOM" in line):
        # if ("SOM" in line):
        #    outputDict[chrom + "_" + stopCoordinate] = line
        #    aStatsDict[aPrefix + "_events"] += 1
        #    aStatsDict[aPrefix + "_pass_events"] += 1

        # keep track of the total number of comparison events (blck, dnSnp,
        # etc.) per file. there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix == "rad"):
                # break up the string to get the individual keys
                radKeyList = radKeyString.split(",")
                # search for each one of them
                for radKey in radKeyList:
                    # if we find one
                    if (radKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + radKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        # if ("PASS" in line and
                        #     (radKey == "GERM" or "DB" not in line)):
                        # if ("PASS" in line and
                        #     (radKey == "Germline" or "DB" not in line)):
                        # if ("PASS" in line):
                        # if (radKey in line):
                        # if ("PASS" in line and "SNP" in line):
                        if ("PASS" in line and
                            "SNP" in line and
                            radKey in line):
                            statKey = aPrefix + "_pass_" + radKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        break

            elif (aPrefix == "cmp"):
                # break up the string to get the individual keys
                cmpKeyList = cmpKeyString.split(",")
                # search for each one of them
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in line):
                        # count it using the keyString
                        # if ("SNP" in line):
                        #    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        # if ("PASS" in line and
                        #    (cmpKey == "GERM" or "DB" not in line)):
                        # if ("PASS" in line and "SNP" in line and
                        #    (cmpKey == "Germline" or "DB" not in line)):
                        # if ("PASS" in line and "SNP" in line and
                        #    "SS=2" in line and cmpKey in line):
                        # if ("PASS" in line):
                        if ("PASS" in line and
                            "SNP" in line and
                            cmpKey in line):
                            statKey = aPrefix + "_pass_" + cmpKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        break

    inputFileHandler.close()

    return (outputDict, aStatsDict)
Example #10
def get_vcf_data(aVcfFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a .vcf input file and uses the python
    ' generator to yield the information one line at a time.  It ignores
    ' empty lines and strips trailing \r\n characters.  This function
    ' yields all the information from the VCF file.
    '
    ' aVcfFile:  A VCF file
    ' aPassOnlyFlag:  If all calls should be processed or only those calls
    '                 that passed the filters thus far
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    for line in fileHandler:

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("VCF: %s", line)

        # if it is an empty line, then just continue
        if (not line or line.isspace()):
            continue

        # these are header lines, so just continue
        elif (line.startswith("#")):
            continue

        # if we are only suppose to process the passed calls
        # and this call has not passed, then skip it
        elif (aPassOnlyFlag and "PASS" not in line):
            continue

        # split the line on the tab
        splitLine = line.split("\t")

        # the chrom is the first element, the coordinate is the second
        chrom = splitLine[0]
        stopCoordinate = int(splitLine[1])
        idList = splitLine[2]
        refList = splitLine[3]
        altList = splitLine[4]
        score = splitLine[5]
        filterSet = set(splitLine[6].split(";"))
        infoList = splitLine[7].split(";")
        infoDict = collections.defaultdict(list)
        for info in infoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, SOMATIC, etc.)
            if (len(keyValueList) == 1):
                infoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                infoDict[keyValueList[0]] = keyValueList[1].split(",")

        # yield all the information about the current coordinate
        yield (chrom, stopCoordinate, idList, refList, altList, score,
               filterSet, infoDict, "\t".join(splitLine[8:]))

    fileHandler.close()
    return
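
A short worked example of the INFO parsing above; the INFO string itself is made up for illustration:

import collections

infoDict = collections.defaultdict(list)
for info in "DP=49;MT=TUM_EDIT;ORIGIN=RNA;SOMATIC;AF=0.1,0.9".split(";"):
    keyValueList = info.split("=")
    # singular keys without a value become ["True"]
    if (len(keyValueList) == 1):
        infoDict[keyValueList[0]] = ["True"]
    else:
        infoDict[keyValueList[0]] = keyValueList[1].split(",")

# infoDict["DP"] == ["49"], infoDict["SOMATIC"] == ["True"],
# infoDict["AF"] == ["0.1", "0.9"]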
Example #11
def main():

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help=
        "the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default"
    )
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-c",
        "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help=
        "by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed"
    )

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename != None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename != None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following:  DEBUG, INFO, WARNING, ERROR, "
                         "CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename != None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors
    i_writeFilenameList = []
    if (i_outputFilename != None):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename != None):
        i_writeFilenameList += [i_logFilename]

    i_readFilenameList = [
        i_vcfFilename, i_rnaGeneFilename, i_rnaGeneFamilyFilename
    ]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    i_outputFileHandler = None
    if (i_outputFilename != None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # get the RNA gene blacklists
    (i_rnaGeneList,
     i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                          i_rnaGeneFamilyFilename, i_debug)

    hasAddedHeader = False
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)
    vcfHeader = "##FILTER=<ID=rgene,Description=\"This gene is on the RNA gene blacklist\">\n"
    vcfHeader += "##FILTER=<ID=rgfam,Description=\"This gene family is on the RNA gene family blacklist\">\n"

    for line in i_vcfFileHandler:

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (i_debug):
            logging.debug("vcfLine: %s", line)

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue
        # if we find the FILTER section, then add the filters from here
        elif ((not hasAddedHeader)
              and (line.startswith("##FILTER") or line.startswith("##INFO"))):
            hasAddedHeader = True
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(vcfHeader)
                i_outputFileHandler.write(line + "\n")
            else:
                print >> sys.stdout, vcfHeader
                print >> sys.stdout, line

        # these lines are from previous scripts in the pipeline, so output them
        elif (line.startswith("#")):
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(line + "\n")
            else:
                print >> sys.stdout, line

        # if we are only supposed to process the passed calls
        # and this call has not passed, then just output it unchanged
        elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(line + "\n")
            else:
                print >> sys.stdout, line

        # now we are to the data
        else:

            # split the line on the tab
            splitLine = line.split("\t")

            filterSet = set(splitLine[6].split(";"))

            # if there are no filters so far, then clear the list
            if (len(filterSet) == 1 and "PASS" in filterSet):
                filterSet = set()

            # parse the info column and create a dict
            infoList = splitLine[7].split(";")
            infoDict = collections.defaultdict(list)
            for info in infoList:
                keyValueList = info.split("=")
                # some keys are just singular without a value (e.g. DB, SOMATIC, etc.)
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            effectList = infoDict["EFF"]
            effectRegEx = re.compile("(\\w).*\\({1}")
            ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

            isRnaBlacklistGene = False
            isRnaBlacklistGeneFamily = False

            for rawEffect in effectList:
                rawEffect = rawEffect.rstrip(")")
                iterator = effectRegEx.finditer(rawEffect)

                # for each match object in the iterator
                for match in iterator:
                    effect = match.group()
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                if (effect in ignoreEffectsList):
                    continue

                effectParts = rawEffect.split("|")
                #effectImpact = effectParts[0]
                #functionalClass = effectParts[1]
                #codonChange = effectParts[2]
                #aaChange = effectParts[3]
                #aaLength = effectParts[4]
                geneName = effectParts[5]
                transcriptBiotype = effectParts[6]
                #geneCoding = effectParts[7]
                #ensembleId = effectParts[8]
                #exonNumber = effectParts[9]
                #genotypeNumber = effectParts[10]

                # the RNA gene list can have "RP11" and that
                # should filter out any gene with RP11 in it
                for rnaGene in i_rnaGeneList:
                    if (rnaGene in geneName):
                        isRnaBlacklistGene = True
                        break

                if (transcriptBiotype in i_rnaGeneFamilyList):
                    isRnaBlacklistGeneFamily = True

            output = ["\t".join(splitLine[0:6])]

            # if the filter should be applied
            if (isRnaBlacklistGene):
                filterSet.add("rgene")
            # if the filter should be applied
            if (isRnaBlacklistGeneFamily):
                filterSet.add("rgfam")

            # if there are no filters so far, then this call passes
            if (len(filterSet) == 0):
                filterSet.add("PASS")

            output.append(";".join(filterSet))

            output.append("\t".join(splitLine[7:]))

            if (i_outputFilename != None):
                i_outputFileHandler.write("\t".join(output) + "\n")
            else:
                print >> sys.stdout, "\t".join(output)

    # close the files
    i_vcfFileHandler.close()
    if (i_outputFilename != None):
        i_outputFileHandler.close()

    return
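
The EFF parsing in the data section above follows the SnpEff-style annotation layout implied by the commented field names. A small sketch with a made-up annotation showing which fields the blacklist filter actually uses; the gene and values are illustrative only:

import re

effectRegEx = re.compile("(\\w).*\\({1}")
rawEffect = ("NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Cca/Aca|P45T|459|"
             "EGFR|protein_coding|CODING|NM_005228.3|2|1)")
rawEffect = rawEffect.rstrip(")")
for match in effectRegEx.finditer(rawEffect):
    effect = match.group()
    rawEffect = rawEffect.replace(effect, "")
    effect = effect.rstrip("(")
# effect == "NON_SYNONYMOUS_CODING"

effectParts = rawEffect.split("|")
geneName = effectParts[5]            # "EGFR"
transcriptBiotype = effectParts[6]   # "protein_coding"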
Example #12
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' The simulation files have 11 fields:  mutation type, chrom, start, end,
    ' target AF, mutation position, base change, coverage in, coverage out,
    ' actual AF, highest AF of anything linked. The useful ones for comparing
    ' against RADIA are chromosome, mutation position, and base change.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for line in inputFileHandler:

        # if it is an empty line or header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("Simulation Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # mutType = splitLine[0]
        chrom = splitLine[1]
        # startCoordinate = splitLine[2]
        # stopCoordinate = splitLine[3]
        # targetAF = splitLine[4]
        mutPosition = splitLine[5]
        # baseChange = splitLine[6]
        # coverageIn = splitLine[7]
        # coverageOut = splitLine[8]
        # actualAF = splitLine[9]
        # highestAF = splitLine[10]

        if (chrom + "_" + mutPosition) in outputDict:
            logging.debug(line + outputDict[chrom + "_" + mutPosition])

        # add the coordinate to the output
        outputDict[chrom + "_" + mutPosition] = line

        # keep track of the number of total events per file
        aStatsDict[aPrefix + "_events"] += 1

        # all events are considered passing events
        aStatsDict[aPrefix + "_pass_events"] += 1

        # keep track of the total number of comparison events (blck, dnSnp,
        # etc.) per file. there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix == "cmp"):
                # break up the string to get the individual keys
                cmpKeyList = cmpKeyString.split(",")
                # search for each one of them
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                        # only count it once
                        break

    inputFileHandler.close()

    return (outputDict, aStatsDict)
Example #13
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It groups all of the information from one query
    ' sequence and uses the python generator to yield the information.  It
    ' ignores empty lines and strips trailing \r\n characters.
    '
    ' aBlatFile:         An output file from BLAT
    ' anOutputFormat:    BLAST or PSL
    ' anIsDebug:         A flag for outputting debug messages to STDERR
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(list)
    previousPrefix = ""

    for line in fileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("BLAT: %s", line)

        # split the line on the tab
        splitLine = line.split("\t")

        # get the coordinate data =
        # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
        if (anOutputFormat == "PSL"):
            # the PSL output has a bunch of header lines that we want to skip
            # if the first column can't be converted into an int, then skip
            try:
                int(splitLine[0])
            except ValueError:
                continue
            blatId = splitLine[9]
        elif (anOutputFormat == "BLAST"):
            blatId = splitLine[0]

        prefix = "_".join(blatId.split("_")[0:3])
        readId = "_".join(blatId.split("_")[0:4])

        # this catches all of the matches except the first one
        if (prefix == previousPrefix):
            blatHitsDict[readId].append(line)
            '''
            if (anIsDebug):
                logging.debug("prefixes match, current=%s, prev=%s",
                              prefix, previousPrefix)
            '''
        # if the prefixes don't match and the blatHitsDict is not empty:
        # we've reached a new set of blat hits, so yield the previous ones
        elif blatHitsDict:
            '''
            if (anIsDebug):
                logging.debug("new prefix=%s, prev=%s", prefix, previousPrefix)
                logging.debug("yielding len blatHits=%s", len(blatHitsDict))
            '''
            # yield the blat hits for this prefix
            yield blatHitsDict
            # clear the blat hits dict for the next matches
            blatHitsDict.clear()
            # set the previous prefix and start filling
            # the dict with the first prefix
            previousPrefix = prefix
            blatHitsDict[readId].append(line)
            '''
            if (anIsDebug):
                logging.debug("after yield current=%s, prev=%s",
                              prefix, previousPrefix)
            '''
        # if the prefixes don't match, and the blatHitsDict is empty:
        # this is the first line of the BLAT output, set the previous prefix
        # and add it to the blatHitsDict
        else:
            blatHitsDict[readId].append(line)
            previousPrefix = prefix

    # this one is needed to yield the very last blatHitsDict when all
    # lines of the BLAT file have been processed
    yield blatHitsDict
    return
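A minimal, hedged sketch of how the generator above might be consumed: each yielded dict maps a read id to the list of BLAT hit lines that share the same prefix. The file name and the downstream handling are assumptions for illustration only.

# illustrative sketch only: iterate over the grouped BLAT hits yielded by
# parse_blat_output() above; the file name here is hypothetical
for blatHitsDict in parse_blat_output("rnaTumorReads.blast", "BLAST", False):
    for readId, hitLines in blatHitsDict.items():
        # each readId maps to all of the BLAT hit lines for that read
        print(readId + "\t" + str(len(hitLines)) + " hits")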
Example No. 17
0
def get_maf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' The .maf files must have at least 14 tab-delimited fields; the ones
    ' used here are the center, chromosome, start and stop coordinates,
    ' the variant type, and the dbSNP id.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}
     
    for line in inputFileHandler:
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("MAF Line: %s", line)
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # these lines are headers from previous scripts, so skip them
        elif (line.startswith("#")):
            continue

        # now we are to the data
        else:
            
            # split the line on the tab
            splitLine = line.split("\t")
            
            # get the fields to yield
            #center = splitLine[2]
            chrom = splitLine[4]
            #startCoordinate = splitLine[5]
            stopCoordinate = splitLine[6]
            #variantType = splitLine[9]
            #dbSnp = splitLine[13]
            
            #if ("Somatic" in line and "SNP" in line):
            if (True):
                #if (chrom + "_" + stopCoordinate) in outputDict:
                #    logging.debug(line + outputDict[chrom + "_" + stopCoordinate])
                
                # add the coordinate to the output
                outputDict[chrom + "_" + stopCoordinate] = line
                
                # keep track of the number of total events per file 
                aStatsDict[aPrefix + "_events"] += 1
                
                # all events are considered passing events
                aStatsDict[aPrefix + "_pass_events"] += 1
                
            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file. there can be multiple keys
            # for one filter such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                if (aPrefix == "rad"):
                    # break up the string to get the individual keys
                    radKeyList = radKeyString.split(",")
                    # search for each one of them
                    for radKey in radKeyList:
                        # if we find one
                        if (radKey in line):
                            # count it using the keyString
                            aStatsDict[aPrefix + "_" + radKeyString] += 1
                            # if this is a passing line, call it using the keyString
                            #if ("PASS" in line and ((radKey == "GERM") or ("DB" not in line))):
                            #if ("PASS" in line and ((radKey == "Germline") or ("DB" not in line))):
                            #if ("PASS" in line and radKey in line):
                            #if ("SNP" in line):
                            if ("SOMATIC" in line):
                                aStatsDict[aPrefix + "_pass_" + radKeyString] += 1
                            # only count it once
                            break
                        
                elif (aPrefix == "cmp"):
                    # break up the string to get the individual keys
                    cmpKeyList = cmpKeyString.split(",")
                    # search for each one of them
                    for cmpKey in cmpKeyList:
                        # if we find one
                        if (cmpKey in line):
                            # count it using the keyString
                            aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                            # if this is a passing line, call it using the keyString
                            #if ("PASS" in line and ((cmpKey == "GERM") or ("DB" not in line))):
                            #if ("PASS" in line and "SNP" in line and ((cmpKey == "Germline") or ("DB" not in line))):
                            #if ("SNP" in line):
                            #if ("PASS" in line):
                            if ("SOMATIC" in line):
                                aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                            # only count it once
                            break
                        
    inputFileHandler.close()
        
    return (outputDict, aStatsDict)
Example No. 18
0
def get_maf_data(anInputFilename, aStatsDict, aCompareDict,
                 aPrefix, anIsDebug):
    '''
    ' The .maf files must have at least 14 tab-delimited fields; the ones
    ' used here are the center, chromosome, start and stop coordinates,
    ' the variant type, and the dbSNP id.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for line in inputFileHandler:

        # if it is an empty line or header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("MAF Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # get the fields to yield
        # center = splitLine[2]
        chrom = splitLine[4]
        # startCoordinate = splitLine[5]
        stopCoordinate = splitLine[6]
        # variantType = splitLine[9]
        # dbSnp = splitLine[13]

        # if ("Somatic" in line and "SNP" in line):
        if (True):
            # coordinateKey = chrom + "_" + stopCoordinate
            # if (coordinateKey) in outputDict:
            #    logging.debug(line + outputDict[coordinateKey])

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the number of total events per file
            aStatsDict[aPrefix + "_events"] += 1

            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1

        # keep track of the total number of comparison events (blck, dnSnp,
        # etc.) per file. there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix == "rad"):
                # break up the string to get the individual keys
                radKeyList = radKeyString.split(",")
                # search for each one of them
                for radKey in radKeyList:
                    # if we find one
                    if (radKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + radKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        # if ("PASS" in line and
                        #    (radKey == "GERM" or "DB" not in line)):
                        # if ("PASS" in line and
                        #    (radKey == "Germline" or "DB" not in line)):
                        # if ("PASS" in line and radKey in line):
                        # if ("SNP" in line):
                        if ("SOMATIC" in line):
                            statKey = aPrefix + "_pass_" + radKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        break

            elif (aPrefix == "cmp"):
                # break up the string to get the individual keys
                cmpKeyList = cmpKeyString.split(",")
                # search for each one of them
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        # if ("PASS" in line and
                        #    (cmpKey == "GERM" or "DB" not in line)):
                        # if ("PASS" in line and "SNP" in line and
                        #    (cmpKey == "Germline" or "DB" not in line)):
                        # if ("SNP" in line):
                        # if ("PASS" in line):
                        if ("SOMATIC" in line):
                            statKey = aPrefix + "_pass_" + cmpKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        break

    inputFileHandler.close()

    return (outputDict, aStatsDict)
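Because the counters above are incremented with "+=" without being initialized first, aStatsDict is presumably a collections.defaultdict(int) (or pre-seeded with zeros). A hedged usage sketch follows; the file name and compare keys are hypothetical.

# illustrative sketch only: the "+=" counters above assume aStatsDict acts
# like a defaultdict(int); the file name and compare keys are hypothetical
import collections

statsDict = collections.defaultdict(int)
compareDict = {"SOM,Somatic": "SOM,Somatic"}   # radKeyString -> cmpKeyString
(mafDict, statsDict) = get_maf_data("calls.maf", statsDict, compareDict,
                                    "cmp", False)
print("%s events, %s passing"
      % (statsDict["cmp_events"], statsDict["cmp_pass_events"]))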
Example No. 19
0
def get_vcf_data(aVcfFile, aHeaderFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a .vcf input file and uses a python generator
    ' to yield the information one line at a time.  It ignores empty lines
    ' and strips trailing \r\n characters.  This function yields all of the
    ' information from the VCF file.
    '
    ' aVcfFile:  A VCF file
    ' aHeaderFile:  A file containing the VCF header lines
    ' aPassOnlyFlag:  If all calls should be processed or only those calls
    '                 that passed the filters thus far
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    
    # open the header file
    fileHandler = radiaUtil.get_read_fileHandler(aHeaderFile)
     
    for line in fileHandler:
          
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("VCF Header: %s", line)    
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue
        
        # if we find the column headers
        elif ("#CHROM" in line):
            columnsLine = line.lstrip("#")
            columnsList = columnsLine.split("\t")
            columnsList = columnsList[9:len(columnsList)]
            continue
        
        # if we find the vcfGenerator line, then create the dict of params
        elif ("vcfGenerator" in line):
            #generatorLine = line.rstrip(">")
            #generatorLine = generatorLine.lstrip("##vcfGenerator=<")
            generatorLine = line[0:(len(line)-1)]
            #print "generatorLine: %s", generatorLine
            generatorLine = generatorLine[16:len(generatorLine)]
            #print "generatorLine: %s", generatorLine
            generatorParamsList = generatorLine.split(",")
            generatorParamsDict = {}
            
            # create a dictionary of existing params
            for param in generatorParamsList:
                (key, value) = param.split("=")
                value = value.rstrip(">")
                value = value.lstrip("<")
                generatorParamsDict[key] = value
            continue
        
        # if we are done with the header, then stop    
        elif (not line.startswith("#")):
            break
        
    fileHandler.close()
    
    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)
     
    for line in fileHandler:
          
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("VCF: %s", line)    
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue
        
        # if we find the column headers
        elif ("#CHROM" in line):
            columnsLine = line.lstrip("#")
            columnsList = columnsLine.split("\t")
            columnsList = columnsList[9:len(columnsList)]
            continue
        
        # if we find the vcfGenerator line, then create the dict of params
        elif ("vcfGenerator" in line):
            #generatorLine = line.rstrip(">")
            #generatorLine = generatorLine.lstrip("##vcfGenerator=<")
            generatorLine = line[0:(len(line)-1)]
            #print "generatorLine: %s", generatorLine
            generatorLine = generatorLine[16:len(generatorLine)]
            #print "generatorLine: %s", generatorLine
            generatorParamsList = generatorLine.split(",")
            generatorParamsDict = {}
            
            # create a dictionary of existing params
            for param in generatorParamsList:
                (key, value) = param.split("=")
                value = value.rstrip(">")
                value = value.lstrip("<")
                generatorParamsDict[key] = value
            continue
                
        # these are header lines, so just continue    
        elif (line.startswith("#")):
            continue
        
        # if we are only supposed to process the passed calls
        # and this call has not passed, then skip it
        elif (aPassOnlyFlag and "PASS" not in line):
            continue

        # split the line on the tab
        splitLine = line.split("\t")

        # the coordinate is the second element
        chrom = splitLine[0]
        stopCoordinate = int(splitLine[1])
        idList = splitLine[2].split(";")
        refList = splitLine[3].split(",")
        altList = splitLine[4].split(",")
        score = float(splitLine[5])
        filterSet = set(splitLine[6].split(";"))
        infoList = splitLine[7].split(";")
        infoDict = collections.defaultdict(list)
        for info in infoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, SOMATIC, etc.)
            if (len(keyValueList) == 1):
                infoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                infoDict[keyValueList[0]] = keyValueList[1].split(",")
                
        # yield all the information about the current coordinate
        yield (chrom, stopCoordinate, idList, refList, altList, score,
               filterSet, infoDict, "\t".join(splitLine[8:]),
               generatorParamsDict)
    fileHandler.close()
    return
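A small sketch of consuming the generator above. The file name is hypothetical; note that every value in infoDict is a list of strings, and that the same file can be passed as both the VCF and the header file since the header lines live in the VCF itself.

# illustrative sketch only: consume the VCF generator defined above;
# the file name is hypothetical and is reused as the header file
for (chrom, stop, ids, refs, alts, score, filterSet, infoDict,
     restOfLine, generatorParams) in get_vcf_data("patient_chr7.vcf",
                                                  "patient_chr7.vcf",
                                                  True, False):
    # every infoDict value is a list, even the flag-style keys
    if ("SOMATIC" in infoDict):
        print(chrom + "_" + str(stop) + "\t" + ",".join(alts))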
Example No. 20
0
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine, aBinSize,
                  anIsDebug):
    '''
    ' This function reads from a .bed file and a .vcf file line by line and
    ' looks for variants that should be filtered or tagged. The .bed file
    ' specifies coordinates for areas where variants should either be included
    ' or excluded.  For example, a .bed file specifying transcription or exon
    ' start and stop coordinates can be provided along with the
    ' --includeOverlaps flag to indicate that the variants in these regions
    ' should be kept, and variants outside of these regions should be flagged
    ' or filtered out.  Conversely, a bed file specifying areas of the genome
    ' that are accessible (as defined by the 1000 Genomes project) can be given
    ' without the --includeOverlaps flag to indicate that the variants outside
    ' of the accessible genome should be flagged or filtered out, and variants
    ' overlapping the accessible regions should not be flagged or filtered out.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '    start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with variants that will be either
    '    included or excluded
    ' anOutputFilename: An output file where the filtered variants are output
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: A flag specifying whether the variants should be
    '    included or excluded when they overlap
    ' anIncludeFilterName: A flag specifying whether the filtering name should
    '    be included in the output or not
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in the
    '    output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps should
    '    be included in the output or not
    ' aFilterHeaderLine: A filter header line that should be added to the VCF
    '    header describing this filter
    ' aBinSize:  The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)

    # get the output file
    i_outputFileHandler = None
    if (anOutputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(anOutputFilename)

    # create the generator for the vcf file
    vcfGenerator = get_vcf_data(i_vcfFileHandler, i_outputFileHandler,
                                aFilterHeaderLine, anIsDebug)

    # initialize some variables
    overlappingEvents = 0
    nonOverlappingEvents = 0
    totalEvents = 0
    startTime = time.time()

    # for each vcf line
    for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id, vcf_ref,
         vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_restLine,
         vcf_line) in (vcfGenerator):

        totalEvents += 1

        if (anIsDebug):
            logging.debug("VCF: %s", vcf_line)

        # check if this vcf coordinate overlaps with the filter coordinates
        posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
        (isOverlap, idValue,
         count) = filterPybed.overlaps_with(posTuple, anIncludeCount)

        # if an event overlaps with the filters
        if (isOverlap):
            # count the overlap
            overlappingEvents += 1

            # if we want to add info about overlaps
            if (anIncludeOverlapInfo):

                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(
                        vcf_filter, vcf_info, aFilterName, aFilterField,
                        anIncludeCount, count, anIncludeId, anIdField, idValue)

                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
                if (anOutputFilename is not None):
                    i_outputFileHandler.write("\t".join(outputList) + "\t" +
                                              "\t".join(vcf_restLine) + "\n")
                else:
                    print >> sys.stdout, ("\t".join(outputList) + "\t" +
                                          "\t".join(vcf_restLine))
            # we don't want to add info about overlaps, just output them
            else:
                # output the event
                if (anOutputFilename is not None):
                    i_outputFileHandler.write(vcf_line + "\n")
                else:
                    print >> sys.stdout, vcf_line
        # these events don't overlap with the filters
        else:
            # count the non overlap
            nonOverlappingEvents += 1

            # if we don't want to add info about overlaps,
            # then we do want to add info about non-overlaps
            if (not anIncludeOverlapInfo):

                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(
                        vcf_filter, vcf_info, aFilterName, aFilterField,
                        anIncludeCount, count, anIncludeId, anIdField, idValue)

                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
                if (anOutputFilename is not None):
                    i_outputFileHandler.write("\t".join(outputList) + "\t" +
                                              "\t".join(vcf_restLine) + "\n")
                else:
                    print >> sys.stdout, ("\t".join(outputList) + "\t" +
                                          "\t".join(vcf_restLine))
            # we do want to add info about overlaps,
            # so just output non-overlaps
            else:
                # output the event
                if (anOutputFilename is not None):
                    i_outputFileHandler.write(vcf_line + "\n")
                else:
                    print >> sys.stdout, vcf_line

    stopTime = time.time()
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                 aChrom, aTCGAId, ((stopTime - startTime) / (3600)),
                 ((stopTime - startTime) / 60), (stopTime - startTime))

    if (overlappingEvents + nonOverlappingEvents == totalEvents):
        logging.info(
            "For chrom %s and Id %s: %s (overlapping events) + " +
            "%s (non-overlapping events) = %s", aChrom, aTCGAId,
            overlappingEvents, nonOverlappingEvents, totalEvents)
    else:
        logging.info(
            "filterByPybed Warning: For chrom %s and Id %s: %s " +
            "(overlapping events) + %s (non-overlapping events) = %s", aChrom,
            aTCGAId, overlappingEvents, nonOverlappingEvents, totalEvents)

    # close the files
    i_vcfFileHandler.close()
    if (anOutputFilename is not None):
        i_outputFileHandler.close()
    return
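filter_events() relies on add_filter() and add_id(), which are not shown in these examples. The sketch below is one plausible reading of what such helpers could do (append the filter name, optional count, and optional id to the FILTER or INFO column); it is an assumption for illustration, not RADIA's actual implementation.

# hedged sketch only: plausible stand-ins for the add_filter()/add_id()
# helpers used above, matching the call signatures but not RADIA's code
def add_filter(aFilter, anInfo, aFilterName, aFilterField, anIncludeCount,
               aCount, anIncludeId, anIdField, anIdValue):
    # build the value to append, optionally with the overlap count and id
    value = aFilterName
    if (anIncludeCount):
        value += "=" + str(aCount)
    if (anIncludeId and anIdField == "INFO"):
        value += "," + str(anIdValue)

    # append the value to either the FILTER column or the INFO column
    if (aFilterField == "FILTER"):
        if (aFilter == "PASS" or aFilter == "."):
            aFilter = value
        else:
            aFilter += ";" + value
    else:
        anInfo += ";" + value
    return (aFilter, anInfo)


def add_id(anId, anIdValue):
    # append the overlap id to the ID column, replacing a "." placeholder
    if (anId == "."):
        return str(anIdValue)
    return anId + ";" + str(anIdValue)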
Example No. 21
0
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' The simulation files have 11 fields:  mutation type, chromosome, start,
    ' end, target AF, mutation position, base change, coverage in, coverage
    ' out, actual AF, and the highest AF of anything linked.  The useful ones
    ' for comparing against RADIA are the chromosome, mutation position, and
    ' base change.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}
     
    for line in inputFileHandler:
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("Simulation Line: %s", line)
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # these lines are from previous scripts in the pipeline, so skip them
        elif (line.startswith("#")):
            continue
        
        # now we are to the data
        else:    
            
            # split the line on the tab
            splitLine = line.split("\t")
            
            #mutType = splitLine[0]
            chrom = splitLine[1]
            #startCoordinate = splitLine[2]
            #stopCoordinate = splitLine[3]
            #targetAF = splitLine[4]
            mutPosition = splitLine[5]
            #baseChange = splitLine[6]
            #coverageIn = splitLine[7]
            #coverageOut = splitLine[8]
            #actualAF = splitLine[9]
            #highestAF = splitLine[10]
            
            if (chrom + "_" + mutPosition) in outputDict:
                logging.debug(line + outputDict[chrom + "_" + mutPosition])
                
            # add the coordinate to the output
            outputDict[chrom + "_" + mutPosition] = line
            
            # keep track of the number of total events per file 
            aStatsDict[aPrefix + "_events"] += 1
            
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1
            
            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file. there can be multiple keys
            # for one filter such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                if (aPrefix == "cmp"):
                    # break up the string to get the individual keys
                    cmpKeyList = cmpKeyString.split(",")
                    # search for each one of them
                    for cmpKey in cmpKeyList:
                        # if we find one
                        if (cmpKey in line):
                            # count it using the keyString
                            aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                            # all simulation events are considered passing,
                            # so also count it using the keyString
                            aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                            # only count it once
                            break
                        
    inputFileHandler.close()
        
    return (outputDict, aStatsDict)
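To make the 11-column layout concrete, here is a made-up simulation line and the two fields the function actually uses to build its coordinate key; the values are purely illustrative.

# illustrative sketch only: a made-up simulation line with the 11
# tab-separated fields described in the docstring above
simLine = "\t".join(["SNP",        # mutation type
                     "7",          # chromosome
                     "55196700",   # start
                     "55196800",   # end
                     "0.25",       # target AF
                     "55196749",   # mutation position
                     "C>T",        # base change
                     "80",         # coverage in
                     "78",         # coverage out
                     "0.23",       # actual AF
                     "0.23"])      # highest linked AF
splitLine = simLine.split("\t")
coordinateKey = splitLine[1] + "_" + splitLine[5]
print(coordinateKey)    # prints 7_55196749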
Example No. 22
0
def get_vcf_data(anInputFilename, aStatsDict, aCompareDict,
                 aPrefix, anIsDebug):
    '''
    ' The .vcf files must have at least 10 fields:  chromosome, coordinate, id
    ' references, alts, quality score, filters, infos, format, and summary info
    ' for at least one .bam file.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}
     
    for line in inputFileHandler:
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("VCF Line: %s", line)
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # these lines are headers from previous scripts, so skip them
        elif (line.startswith("#")):
            continue

        # now we are to the data
        else:
            
            # split the line on the tab
            splitLine = line.split("\t")
            
            # get the fields to yield
            #columnHeaders = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
            chrom = splitLine[0]
            stopCoordinate = splitLine[1]
             
            #outputDict[chrom + "_" + stopCoordinate] = line
                    
            # keep track of the number of total events per file 
            #aStatsDict[aPrefix + "_events"] += 1
            
            #if ("PASS" in line):
            #    aStatsDict[aPrefix + "_pass_events"] += 1
                    
            # the file being compared against has to be the smaller/limited
            # set (i.e. only the passing somatic events), otherwise
            # everything will be found
            if (aPrefix == "rad" and "PASS" in line and "SNP" in line and
                ("SOM" in line or "EDIT" in line or
                 "RNA_TUM_VAR" in line or "RNA_NOR_VAR" in line)):
            #if (aPrefix == "rad" and "SNP" in line):
            #if (aPrefix == "rad"):
                # add the coordinate to the output
                outputDict[chrom + "_" + stopCoordinate] = line
            #elif (aPrefix == "cmp" and "PASS" in line):
            elif (aPrefix == "cmp" and "PASS" in line and "SNP" in line and
                  ("SOM" in line or "EDIT" in line or
                   "RNA_TUM_VAR" in line or "RNA_NOR_VAR" in line)):
            #elif (aPrefix == "cmp" and "SNP" in line):
                outputDict[chrom + "_" + stopCoordinate] = line
            
            #if ("PASS" in line and "Somatic" in line and "SNP" in line):
            #if ("SOM" in line):
            #    outputDict[chrom + "_" + stopCoordinate] = line
                    
            #    # keep track of the number of total events per file 
            #    aStatsDict[aPrefix + "_events"] += 1
                
            #    if ("PASS" in line):
            #        aStatsDict[aPrefix + "_pass_events"] += 1
            
            #if ("PASS" in line and "SOM" in line):
            #if ("SOM" in line):
            #    outputDict[chrom + "_" + stopCoordinate] = line 
            #    aStatsDict[aPrefix + "_events"] += 1
            #    aStatsDict[aPrefix + "_pass_events"] += 1
            
            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file. there can be multiple keys
            # for one filter such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                if (aPrefix == "rad"):
                    # break up the string to get the individual keys
                    radKeyList = radKeyString.split(",")
                    # search for each one of them
                    for radKey in radKeyList:
                        # if we find one
                        if (radKey in line):
                            # count it using the keyString
                            aStatsDict[aPrefix + "_" + radKeyString] += 1
                            # if this is a passing line, call it using the keyString
                            #if ("PASS" in line and ((radKey == "GERM") or ("DB" not in line))):
                            #if ("PASS" in line and ((radKey == "Germline") or ("DB" not in line))):
                            if ("PASS" in line and "SNP" in line and radKey in line):
                            #if ("PASS" in line):
                            #if (radKey in line):
                            #if ("PASS" in line and "SNP" in line):
                                aStatsDict[aPrefix + "_pass_" + radKeyString] += 1
                            # only count it once
                            break
                        
                elif (aPrefix == "cmp"):
                    # break up the string to get the individual keys
                    cmpKeyList = cmpKeyString.split(",")
                    # search for each one of them
                    for cmpKey in cmpKeyList:
                        # if we find one
                        if (cmpKey in line):
                            # count it using the keyString
                            #if ("SNP" in line):
                            #    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                            aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                            # if this is a passing line, call it using the keyString
                            #if ("PASS" in line and ((cmpKey == "GERM") or ("DB" not in line))):
                            #if ("PASS" in line and "SNP" in line and ((cmpKey == "Germline") or ("DB" not in line))):
                            #if ("PASS" in line and "SNP" in line and "SS=2" in line and cmpKey in line):
                            if ("PASS" in line and "SNP" in line and cmpKey in line):
                            #if ("PASS" in line):
                                aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                            # only count it once
                            break
                        
    inputFileHandler.close()
        
    return (outputDict, aStatsDict)
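The function above admits a call only when several substrings co-occur on the raw line. The snippet below replays that test on a made-up VCF data line; the line content is illustrative, not real RADIA output.

# illustrative sketch only: the substring-based admission test used above,
# applied to a made-up VCF data line
vcfLine = ("7\t55196749\t.\tC\tT\t30\tPASS\t"
           "SS=Somatic;SOM;VT=SNP;DP=80\tGT:DP\t0/1:80")
keep = ("PASS" in vcfLine and "SNP" in vcfLine and
        ("SOM" in vcfLine or "EDIT" in vcfLine or
         "RNA_TUM_VAR" in vcfLine or "RNA_NOR_VAR" in vcfLine))
if (keep):
    splitLine = vcfLine.split("\t")
    print(splitLine[0] + "_" + splitLine[1])    # prints 7_55196749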
Example No. 23
0
def get_validation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' The validation files must have at least 7 tab-delimited fields:
    ' chromosome, start and stop coordinates, reference allele, variant
    ' allele, source (center), and validation result.
    '
    ' anInputFilename: The name of the input file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}
     
    for line in inputFileHandler:
        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")
        
        #if (anIsDebug):
        #    logging.debug("Validation Line: %s", line)
            
        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # these lines are from previous scripts in the pipeline, so skip them
        elif (line.startswith("#")):
            continue

        # this is a header line, so skip it
        elif (line.startswith("chrom")):
            continue
        
        # now we are to the data
        else:    
            
            # split the line on the tab
            splitLine = line.split("\t")
            
            # get the fields to yield
            # columnHeaders = ["chrom", "chr_start", "chr_stop", "ref", "var", "source", "val_result"]
            # these are 0-based
            chrom = splitLine[0]
            #startCoordinate = splitLine[1]
            stopCoordinate = splitLine[2]
            #ref = splitLine[3]
            #variantAllele = splitLine[4]
            #center = splitLine[5]
            #valResult = splitLine[6]
            
            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line
            
            # keep track of the number of total events per file 
            aStatsDict[aPrefix + "_events"] += 1
            
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1
            
            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file. there can be multiple keys
            # for one filter such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                if (aPrefix == "cmp"):
                    # break up the string to get the individual keys
                    cmpKeyList = cmpKeyString.split(",")
                    # search for each one of them
                    for cmpKey in cmpKeyList:
                        # if we find one
                        if (cmpKey in line):
                            # count it using the keyString
                            aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                            # all validation events are considered passing,
                            # so also count it using the keyString (the
                            # cmpKey was already matched above)
                            aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                            # only count it once
                            break
                        
    inputFileHandler.close()
        
    return (outputDict, aStatsDict)
Example No. 24
0
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine,
                  aBinSize, anIsDebug):
    '''
    ' This function reads from a .bed file and a .vcf file line by line and
    ' looks for variants that should be filtered or tagged. The .bed file
    ' specifies coordinates for areas where variants should either be included
    ' or excluded.  For example, a .bed file specifying transcription or exon
    ' start and stop coordinates can be provided along with the
    ' --includeOverlaps flag to indicate that the variants in these regions
    ' should be kept, and variants outside of these regions should be flagged
    ' or filtered out.  Conversely, a bed file specifying areas of the genome
    ' that are accessible (as defined by the 1000 Genomes project) can be given
    ' without the --includeOverlaps flag to indicate that the variants outside
    ' of the accessible genome should be flagged or filtered out, and variants
    ' overlapping the accessible regions should not be flagged or filtered out.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '    start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with variants that will be either
    '    included or excluded
    ' anOutputFilename: An output file where the filtered variants are output
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: A flag specifying whether the variants should be
    '    included or excluded when they overlap
    ' anIncludeFilterName: A flag specifying whether the filtering name should
    '    be included in the output or not
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in the
    '    output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps should
    '    be included in the output or not
    ' aFilterHeaderLine: A filter header line that should be added to the VCF
    '    header describing this filter
    ' aBinSize:  The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)

    # get the output file
    i_outputFileHandler = None
    if (anOutputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(anOutputFilename)

    # create the generator for the vcf file
    vcfGenerator = get_vcf_data(i_vcfFileHandler,
                                i_outputFileHandler,
                                aFilterHeaderLine,
                                anIsDebug)

    # initialize some variables
    overlappingEvents = 0
    nonOverlappingEvents = 0
    totalEvents = 0
    startTime = time.time()

    # for each vcf line
    for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate,
         vcf_id, vcf_ref, vcf_alt, vcf_qual, vcf_filter,
         vcf_info, vcf_restLine, vcf_line) in (vcfGenerator):

        totalEvents += 1

        if (anIsDebug):
            logging.debug("VCF: %s", vcf_line)

        # check if this vcf coordinate overlaps with the filter coordinates
        posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
        (isOverlap, idValue, count) = filterPybed.overlaps_with(posTuple,
                                                                anIncludeCount)

        # if an event overlaps with the filters
        if (isOverlap):
            # count the overlap
            overlappingEvents += 1

            # if we want to add info about overlaps
            if (anIncludeOverlapInfo):

                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(vcf_filter,
                                                        vcf_info,
                                                        aFilterName,
                                                        aFilterField,
                                                        anIncludeCount,
                                                        count,
                                                        anIncludeId,
                                                        anIdField,
                                                        idValue)

                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
                if (anOutputFilename is not None):
                    i_outputFileHandler.write("\t".join(outputList) + "\t" +
                                              "\t".join(vcf_restLine) + "\n")
                else:
                    print >> sys.stdout, ("\t".join(outputList) + "\t" +
                                          "\t".join(vcf_restLine))
            # we don't want to add info about overlaps, just output them
            else:
                # output the event
                if (anOutputFilename is not None):
                    i_outputFileHandler.write(vcf_line + "\n")
                else:
                    print >> sys.stdout, vcf_line
        # these events don't overlap with the filters
        else:
            # count the non overlap
            nonOverlappingEvents += 1

            # if we don't want to add info about overlaps,
            # then we do want to add info about non-overlaps
            if (not anIncludeOverlapInfo):

                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(vcf_filter,
                                                        vcf_info,
                                                        aFilterName,
                                                        aFilterField,
                                                        anIncludeCount,
                                                        count,
                                                        anIncludeId,
                                                        anIdField,
                                                        idValue)

                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
                if (anOutputFilename is not None):
                    i_outputFileHandler.write("\t".join(outputList) + "\t" +
                                              "\t".join(vcf_restLine) + "\n")
                else:
                    print >> sys.stdout, ("\t".join(outputList) + "\t" +
                                          "\t".join(vcf_restLine))
            # we do want to add info about overlaps,
            # so just output non-overlaps
            else:
                # output the event
                if (anOutputFilename is not None):
                    i_outputFileHandler.write(vcf_line + "\n")
                else:
                    print >> sys.stdout, vcf_line

    stopTime = time.time()
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                 aChrom, aTCGAId, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))

    if (overlappingEvents + nonOverlappingEvents == totalEvents):
        logging.info("For chrom %s and Id %s: %s (overlapping events) + " +
                     "%s (non-overlapping events) = %s", aChrom, aTCGAId,
                     overlappingEvents, nonOverlappingEvents, totalEvents)
    else:
        logging.info("filterByPybed Warning: For chrom %s and Id %s: %s " +
                     "(overlapping events) + %s (non-overlapping events) = %s",
                     aChrom, aTCGAId, overlappingEvents,
                     nonOverlappingEvents, totalEvents)

    # close the files
    i_vcfFileHandler.close()
    if (anOutputFilename is not None):
        i_outputFileHandler.close()
    return
Example No. 25
0
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It groups all of the information from one query
    ' sequence and uses the python generator to yield the information.  It
    ' ignores empty lines and strips trailing \r\n characters.
    '
    ' aBlatFile:         An output file from BLAT
    ' anOutputFormat:    BLAST or PSL
    ' anIsDebug:         A flag for outputting debug messages to STDERR
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(list)
    previousPrefix = ""

    for line in fileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("BLAT: %s", line)

        # split the line on the tab
        splitLine = line.split("\t")

        # get the coordinate data =
        # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
        if (anOutputFormat == "PSL"):
            # the PSL output has a bunch of header lines that we want to skip
            # if the first column can't be converted into an int, then skip
            try:
                int(splitLine[0])
            except ValueError:
                continue
            blatId = splitLine[9]
        elif (anOutputFormat == "BLAST"):
            blatId = splitLine[0]

        prefix = "_".join(blatId.split("_")[0:3])
        readId = "_".join(blatId.split("_")[0:4])

        # this catches all of the matches except the first one
        if (prefix == previousPrefix):
            blatHitsDict[readId].append(line)
            '''
            if (anIsDebug):
                logging.debug("prefixes match, current=%s, prev=%s",
                              prefix, previousPrefix)
            '''
        # if the prefixes don't match and the blatHitsDict is not empty:
        # we've reached a new set of blat hits, so yield the previous ones
        elif blatHitsDict:
            '''
            if (anIsDebug):
                logging.debug("new prefix=%s, prev=%s", prefix, previousPrefix)
                logging.debug("yielding len blatHits=%s", len(blatHitsDict))
            '''
            # yield the blat hits for this prefix
            yield blatHitsDict
            # clear the blat hits dict for the next matches
            blatHitsDict.clear()
            # set the previous prefix and start filling
            # the dict with the first prefix
            previousPrefix = prefix
            blatHitsDict[readId].append(line)
            '''
            if (anIsDebug):
                logging.debug("after yield current=%s, prev=%s",
                              prefix, previousPrefix)
            '''
        # if the prefixes don't match, and the blatHitsDict is empty:
        # this is the first line of the BLAT output, set the previous prefix
        # and add it to the blatHitsDict
        else:
            blatHitsDict[readId].append(line)
            previousPrefix = prefix

    # this one is needed to yield the very last blatHitsDict when all
    # lines of the BLAT file have been processed
    yield blatHitsDict
    return
Example No. 26
0
def main():

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename", default=sys.stdout,
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        # keep sys.stdout as-is (don't stringify it) so that the
        # "is not sys.stdout" checks below work as intended
        i_outputFilename = i_cmdLineOptions.outputFilename
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError("Invalid log level: '%s' must be one of the " +
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL",
                         i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors
    i_writeFilenameList = []
    if (i_outputFilename is not sys.stdout):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList += [i_logFilename]

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the input stream
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # get the RNA gene blacklists
    (i_rnaGeneList,
     i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                          i_rnaGeneFamilyFilename,
                                          i_debug)

    hasAddedFilterHeader = False

    for line in i_vcfFileHandler:

        if (i_debug):
            logging.debug("vcfLine: %s", line)

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # if we find the FILTER section, then add the filters from here
        elif ((not hasAddedFilterHeader) and (line.startswith("##FILTER"))):
            hasAddedFilterHeader = True
            i_outputFileHandler.write(
                "##FILTER=<ID=rgene,Description=\"This gene is on the " +
                "RNA gene blacklist\">\n")
            i_outputFileHandler.write(
                "##FILTER=<ID=rgfam,Description=\"This gene family is on " +
                "the RNA gene family blacklist\">\n")
            i_outputFileHandler.write(line)

        # these lines are from previous scripts in the pipeline, so output them
        elif (line.startswith("#")):
            i_outputFileHandler.write(line)

        # if we are only supposed to process the passed calls and this call
        # has not passed, then just output it unchanged and move on
        elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
            i_outputFileHandler.write(line)

        # now we are to the data
        else:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            filterSet = set(splitLine[6].split(";"))

            # if there are no filters so far, then clear the list
            if (len(filterSet) == 1 and "PASS" in filterSet):
                filterSet = set()

            # parse the info column and create a dict
            infoList = splitLine[7].split(";")
            infoDict = collections.defaultdict(list)
            for info in infoList:
                keyValueList = info.split("=")
                # some keys are just singular without a value (e.g. DB, etc.)
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            effectList = infoDict["EFF"]
            effectRegEx = re.compile("(\\w).*\\({1}")
            ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]
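            # each EFF entry is expected to look roughly like the following
            # (gene, biotype, and transcript values here are illustrative):
            # EXON(MODIFIER|||||RP11-someGene|lincRNA|NON_CODING|ENST00000000000|1|1)
            # i.e. effect(impact|class|codonChange|aaChange|aaLength|gene|
            #             biotype|coding|transcriptId|exonNumber|genotypeNumber)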

            isRnaBlacklistGene = False
            isRnaBlacklistGeneFamily = False

            for rawEffect in effectList:
                rawEffect = rawEffect.rstrip(")")
                iterator = effectRegEx.finditer(rawEffect)

                # for each match object in the iterator
                for match in iterator:
                    effect = match.group()
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                if (effect in ignoreEffectsList):
                    continue

                effectParts = rawEffect.split("|")
                # effectImpact = effectParts[0]
                # functionalClass = effectParts[1]
                # codonChange = effectParts[2]
                # aaChange = effectParts[3]
                # aaLength = effectParts[4]
                geneName = effectParts[5]
                transcriptBiotype = effectParts[6]
                # geneCoding = effectParts[7]
                # ensembleId = effectParts[8]
                # exonNumber = effectParts[9]
                # genotypeNumber = effectParts[10]

                # entries in the RNA gene list (e.g. "RP11") are matched as
                # substrings, so any gene name containing the entry is filtered
                for rnaGene in i_rnaGeneList:
                    if (rnaGene in geneName):
                        isRnaBlacklistGene = True
                        break

                if (transcriptBiotype in i_rnaGeneFamilyList):
                    isRnaBlacklistGeneFamily = True

            output = ["\t".join(splitLine[0:6])]

            # if the filter should be applied
            if (isRnaBlacklistGene):
                filterSet.add("rgene")
            # if the filter should be applied
            if (isRnaBlacklistGeneFamily):
                filterSet.add("rgfam")

            # if there are no filters so far, then this call passes
            if (len(filterSet) == 0):
                filterSet.add("PASS")

            output.append(";".join(filterSet))

            output.append("\t".join(splitLine[7:]))

            if (i_outputFilename is not sys.stdout):
                i_outputFileHandler.write("\t".join(output) + "\n")
            else:
                sys.stdout.write("\t".join(output) + "\n")

    # close the files
    i_vcfFileHandler.close()
    if (i_outputFilename is not sys.stdout):
        i_outputFileHandler.close()

    return
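
# A hedged, illustrative sketch of what the loop above does to one data line
# (tab-separated columns, values made up, not from the source): if the EFF
# gene name matches an entry on the RNA gene blacklist, the FILTER column is
# updated, e.g. from
#     1   12345   .   A   G   20   PASS    EFF=EXON(...|RP11-someGene|lincRNA|...)
# to
#     1   12345   .   A   G   20   rgene   EFF=EXON(...|RP11-someGene|lincRNA|...)
# and every other column is echoed unchanged.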
Ejemplo n.º 27
0
def get_vcf_data(anId, anInputDir, anIsDebug):
    '''
    ' This function loads all of the per-chromosome VCF files that start with
    ' the given id.  The header lines are collected from the first file
    ' processed, and the data lines are grouped by chromosome so that they
    ' can be sorted later.
    '
    ' anId:  The id that prefixes the per-chromosome VCF filenames
    ' anInputDir:  The directory containing the per-chromosome VCF files
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    processedHeader = False
    headerDict = dict()
    headerDict["metadata"] = list()
    headerDict["format"] = list()
    headerDict["info"] = list()
    headerDict["filter"] = list()
    headerDict["chrom"] = list()
    coordinateDict = dict()
    coordinateDict["numbers"] = dict()
    coordinateDict["letters"] = dict()

    # if the input directory doesn't end with a forward slash,
    # then add one so that glob.glob will work
    if (not anInputDir.endswith("/")):
        anInputDir = anInputDir + "/"

    # for each vcf file
    # they might be gzipped, they might not
    for vcfFile in (glob.glob(anInputDir + anId + "_chr*.vcf*")):
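        # e.g. with anId = "someId", this pattern would match files like
        # "someId_chr1.vcf" or "someId_chr1.vcf.gz" (illustrative names)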

        # open the file
        vcfFileHandler = radiaUtil.get_read_fileHandler(vcfFile)

        for line in vcfFileHandler:

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # if we haven't processed the header yet, then do it here
            if (not processedHeader):
                # extract the metadata
                if (line.startswith("##FORMAT")):
                    headerDict["format"].append(line)
                elif (line.startswith("##INFO")):
                    headerDict["info"].append(line)
                elif (line.startswith("##FILTER")):
                    headerDict["filter"].append(line)
                elif (line.startswith("##")):
                    headerDict["metadata"].append(line)
                elif (line.startswith("#CHROM")):
                    headerDict["chrom"].append(line)
                    # now we've processed the header
                    processedHeader = True

            if (line.startswith("#")):
                continue
            else:
                # split the line on the tab
                splitLine = line.split("\t")

                # the chromosome is the first element
                chrom = splitLine[0]

                # we want to sort everything at the end, so keep track
                # of the chroms that are numbers and letters separately
                if (is_number(chrom)):
                    if chrom not in coordinateDict["numbers"]:
                        coordinateDict["numbers"][chrom] = list()
                    coordinateDict["numbers"][chrom].append(line)
                else:
                    if chrom not in coordinateDict["letters"]:
                        coordinateDict["letters"][chrom] = list()
                    coordinateDict["letters"][chrom].append(line)

        # close the file and move onto the next one
        vcfFileHandler.close()

    return (headerDict, coordinateDict)
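
# A minimal, hypothetical usage sketch (not from the original pipeline code):
# merge the grouped calls back into one sorted stream by emitting the header
# sections first, then the numeric chromosomes in numeric order, then the
# lettered chromosomes (e.g. X, Y, MT) in alphabetical order.
#
#     headerDict, coordinateDict = get_vcf_data("someId", "/some/inputDir", False)
#     for section in ("metadata", "filter", "info", "format", "chrom"):
#         for headerLine in headerDict[section]:
#             print(headerLine)
#     for chrom in sorted(coordinateDict["numbers"], key=int):
#         for vcfLine in coordinateDict["numbers"][chrom]:
#             print(vcfLine)
#     for chrom in sorted(coordinateDict["letters"]):
#         for vcfLine in coordinateDict["letters"][chrom]:
#             print(vcfLine)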