Exemple #1
0
    def testProteinChange(self):
        """ Test that protein change parsing of start and end works.

        Each case pairs a protein change string with the [start, end] positions
        (as strings) that MutUtils.extractProteinPosition is expected to return.
        """
        # Each tuple is (test input, ground truth [start, end]).
        testInOuts = [
            ("p.K128_R130del", ['128', '130']),
            ("p.W274G", ["274", "274"]),
            ("p.13_14AA>A", ["13", "14"]),
            ("p.G25_splice", ["25", "25"]),
            ("p.E813*", ["813", "813"]),
            ("p.SLPQPEQRPY59del", ["59", "59"]),
        ]

        for proteinChange, groundTruth in testInOuts:
            result = MutUtils.extractProteinPosition(proteinChange)
            # A pair of empty strings is what the parser returns for unparseable input
            # (see the "blahblah" check below), so it must not appear for valid input.
            self.assertNotEqual(result, ['', ''], "Result was empty.  " + str(proteinChange) + ".  ")

            mismatchMsg = "Result did not match for " + str(proteinChange) + ".  " + str(result) + "  GT: " + str(groundTruth)
            # Start and end are compared separately so only the first two entries of
            # the result are constrained, as in the original combined check.
            self.assertEqual(result[0], groundTruth[0], mismatchMsg)
            self.assertEqual(result[1], groundTruth[1], mismatchMsg)

        # Input with no parseable position must yield empty start and end strings.
        self.assertEqual(MutUtils.extractProteinPosition("blahblah"), ['', ''])
Exemple #2
0
    def testProteinChange(self):
        """ Test that protein change parsing of start and end works.

        Every entry of the table below is a protein change string together
        with the expected [start, end] pair (as strings) produced by
        MutUtils.extractProteinPosition.
        """
        # Each tuple is (test input, ground truth [start, end]).
        testInOuts = [("p.K128_R130del", ['128', '130']),
                      ("p.W274G", ["274", "274"]),
                      ("p.13_14AA>A", ["13", "14"]),
                      ("p.G25_splice", ["25", "25"]),
                      ("p.E813*", ["813", "813"]),
                      ("p.SLPQPEQRPY59del", ["59", "59"])]

        for proteinChange, groundTruth in testInOuts:
            result = MutUtils.extractProteinPosition(proteinChange)
            # ['', ''] is the "nothing parsed" answer (checked for "blahblah"
            # at the end), so valid input must never produce it.
            self.assertNotEqual(
                result, ['', ''],
                "Result was empty.  " + str(proteinChange) + ".  ")

            mismatchMsg = ("Result did not match for " + str(proteinChange) +
                           ".  " + str(result) + "  GT: " + str(groundTruth))
            # Only the first two entries of the result are constrained.
            self.assertEqual(result[0], groundTruth[0], mismatchMsg)
            self.assertEqual(result[1], groundTruth[1], mismatchMsg)

        # Unparseable input must come back as empty start and end strings.
        self.assertEqual(
            MutUtils.extractProteinPosition("blahblah"), ['', ''])
Exemple #3
0
    def indexGeneProteinPosition(geneColumn, proteinInfoColumn, inputFilename, outputFilename):
        """
        Creates an intermediate temporary file that includes two additional columns, startAA and endAA,
        sorts the file, writes the sorted file to outputFilename, and then indexes the sorted file.

        Rows whose protein column is missing/blank, or for which no start and end position can be
        extracted, are not written to the intermediate file and so do not reach the output.

        :param geneColumn: name of the gene column in the inputFilename
        :param proteinInfoColumn: name of the protein change or position column. Can be of formats: p.K128_R130del
        (position 128 through 130) For more examples, see MutUtilsTest.testProteinChange()
        :param inputFilename: input tsv filename
        :param outputFilename: output filename
        :return: the value returned by TabixIndexer.index for the sorted output file
        """
        startAACol = "startAA"
        endAACol = "endAA"

        # Create intermediate file.  Do not use '#' for comments, since header can start with '#'
        tsvReader = GenericTsvReader(inputFilename, commentPrepend=";")

        # These are the outputHeaders for the intermediate file.
        # NOTE(review): the list is extended and edited IN PLACE below, exactly as before.  If
        # getFieldNames() hands back the reader's own list, the reader observes these edits too, so
        # confirm against GenericTsvReader before replacing the in-place operations with a copy.
        headers = tsvReader.getFieldNames()

        if startAACol not in headers:
            headers += [startAACol]
        if endAACol not in headers:
            headers += [endAACol]

        # If a header starts with '#', strip it.  Note that replace() removes every '#' in such a
        # header name, not only the leading one.
        for i, header in enumerate(headers):
            if header.startswith("#"):
                headers[i] = header.replace("#", "")

        # Get indices of relevant columns (raises ValueError if geneColumn is not a header).
        gene_i = headers.index(geneColumn)
        startAA_i = headers.index(startAACol)
        endAA_i = headers.index(endAACol)

        # Use the whole file path name.
        outputFilename = os.path.abspath(outputFilename)

        def sortKey(row):
            # Sort on the same columns that are handed to the indexer below.  The gene column is
            # looked up by geneColumn rather than a hard-coded "Gene name", so the sort order and
            # the index agree for any gene column name.
            return row[geneColumn].lower(), int(row[startAACol]), int(row[endAACol])

        # Write to the intermediate temporary file.  No directory is passed, so tempfile chooses
        # the location.  The NamedTemporaryFile object is kept open until sorting is done, because
        # closing it deletes the file.
        temp = tempfile.NamedTemporaryFile()
        try:
            with open(temp.name, 'w') as csvfile:
                # Initialize the intermediate file's header.
                tsvWriter = csv.DictWriter(csvfile, headers, delimiter='\t', lineterminator='\n')
                tsvWriter.writeheader()

                # Write each line of the intermediate file.
                for row in tsvReader:
                    protein = row[proteinInfoColumn]
                    if protein is None or not protein.strip():
                        continue
                    startAA, endAA = MutUtils.extractProteinPosition(protein)
                    if not startAA.strip() or not endAA.strip():
                        continue
                    row[startAACol] = startAA
                    row[endAACol] = endAA
                    tsvWriter.writerow(row)

            # Sort the intermediate tsv file into outputFilename.
            tsvSorter = TsvFileSorter(temp.name)
            tsvSorter.sortFile(outputFilename, sortKey)
        finally:
            # Removes the intermediate file even when writing or sorting fails.
            temp.close()

        return TabixIndexer.index(destDir=os.path.dirname(outputFilename),
                                  inputFilename=outputFilename, fileColumnNumList=[gene_i, startAA_i, endAA_i])