def testSortFileWithSpaces(self):
    """
    Sorts a file whose header names contain spaces and checks that an output file appears on disk.
    """
    srcPath = os.path.join(
        "testdata",
        "small_cosmic_with_gp_and_gpp",
        "small_cosmic_trimmed_for_sorting.txt.tbi.byAA",
    )
    dstPath = os.path.join("out", "small_cosmic_trimmed_for_sorting.txt.byAA.sorted.tsv")

    def sortKey(row):
        # Lower-cased gene name first, then the numeric start and end positions.
        geneName = row["Gene name"]
        return (geneName.lower(), int(row["startAA"]), int(row["endAA"]))

    sorter = TsvFileSorter(srcPath)
    sorter.sortFile(dstPath, sortKey)

    wasWritten = os.path.exists(dstPath)
    self.assertTrue(wasWritten, "No file was generated.")
def testSortFile(self):
    """
    Sorts a file on the filesystem and checks that an output file appears on disk.
    """
    srcPath = os.path.join("testdata", "small_cosmic_gpp", "small_cosmic_gpp.tempForSorting.tsv")
    dstPath = os.path.join("out", "small_cosmic_gpp.tempForSorting.out.tsv")

    def sortKey(row):
        # Lower-cased gene name first, then the numeric start and end positions.
        geneName = row["Gene_name"]
        return (geneName.lower(), int(row["startAA"]), int(row["endAA"]))

    sorter = TsvFileSorter(srcPath)
    sorter.sortFile(dstPath, sortKey)

    wasWritten = os.path.exists(dstPath)
    self.assertTrue(wasWritten, "No file was generated.")
def testCallbackExceptionIncorrectType(self):
    """
    Tests that the CallbackException is raised when the input anonymous function does not return a tuple
    given a row.

    The test fails if no CallbackException is raised at all, or if the exception's ``value`` differs from
    the expected message.
    """
    inputFilename = os.path.join("testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv")
    outputFilename = os.path.join("out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
    tsvFileSorter = TsvFileSorter(inputFilename)

    # Deliberately return a bare string (not a tuple) to provoke the error.
    func = lambda val: (val["Gene name"]).lower()

    expectedMsg = ("The value returned by the callback must be a tuple. Instead, a value of "
                   "<type 'str'> was returned.")
    try:
        tsvFileSorter.sortFile(outputFilename, func, 3)
    except CallbackException as err:
        self.assertEqual(err.value, expectedMsg, "Error msg is different.")
    else:
        # Without this branch the test would pass silently when nothing is raised.
        self.fail("CallbackException was not raised for a callback that does not return a tuple.")
def getSortedTsvFilename(self, path):
    """
    Sort the tsv file at self.filename into a new temporary file and remove the unsorted original.

    Rows are keyed by the tuple (hash code looked up for the "chr" value, integer "start", "alt_allele").

    :param path: directory in which the sorted temporary file is created
    :return: name of the sorted temporary file (created with delete=False, so the caller owns its removal)
    """
    chrom2HashCode = MutUtils.createChrom2HashCodeTable(self.chroms)
    tsvFileSorter = TsvFileSorter(self.filename)

    sortedTempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
    # Only the reserved name is needed. Close our handle right away so it is not leaked and so the
    # sorter can reopen the path on platforms that forbid opening an already-open temporary file.
    sortedTempTsvFile.close()
    sortedTsvFilename = sortedTempTsvFile.name

    func = lambda val: (chrom2HashCode[val["chr"]], int(val["start"]), val["alt_allele"])
    tsvFileSorter.sortFile(sortedTsvFilename, func)

    # The unsorted input is no longer needed once the sorted copy exists.
    os.remove(self.filename)
    return sortedTsvFilename
def getSortedTsvFilename(self, path):
    """
    Sort the tsv file at self.filename into a new temporary file and remove the unsorted original.

    Rows are keyed by the tuple (hash code looked up for the "chr" value, integer "start", "alt_allele").

    :param path: directory in which the sorted temporary file is created
    :return: name of the sorted temporary file (created with delete=False, so the caller owns its removal)
    """
    chrom2HashCode = MutUtils.createChrom2HashCodeTable(self.chroms)
    tsvFileSorter = TsvFileSorter(self.filename)

    sortedTempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
    # Only the reserved name is needed. Close our handle right away so it is not leaked and so the
    # sorter can reopen the path on platforms that forbid opening an already-open temporary file.
    sortedTempTsvFile.close()
    sortedTsvFilename = sortedTempTsvFile.name

    func = lambda val: (chrom2HashCode[val["chr"]], int(val["start"]), val["alt_allele"])
    self.logger.debug("Sorting tmp tsv %s->%s", self.filename, sortedTsvFilename)
    tsvFileSorter.sortFile(sortedTsvFilename, func)

    # The unsorted input is no longer needed once the sorted copy exists.
    os.remove(self.filename)
    return sortedTsvFilename
def testSortMixedCaps(self):
    """
    Tests sorting a file with mixed capitalization in the reference column.

    The sorted output must exist and its md5 must equal that of the expected file
    testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
    """
    inputFilename = os.path.join("testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv")
    outputFilename = os.path.join("out", "sort_mixed_caps.tsv.sorted.out.tsv")
    expectedFilename = os.path.join("testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv")

    tsvFileSorter = TsvFileSorter(inputFilename)
    func = lambda val: ((val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
    tsvFileSorter.sortFile(outputFilename, func)

    self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

    # Read both files through context managers so the handles are closed deterministically.
    with open(outputFilename, "r") as guessFile:
        guessmd5 = hashlib.md5(guessFile.read()).hexdigest()
    with open(expectedFilename, "r") as gtFile:
        gtmd5 = hashlib.md5(gtFile.read()).hexdigest()

    # assertEqual reports both digests on failure, unlike assertTrue(a == b).
    self.assertEqual(guessmd5, gtmd5)
def testMultiplePartitionSorting(self):
    """
    Tests that the sorting works when the partition size is small and input file must be broken into
    multiple partitions.

    The sorted output must exist and its md5 must equal that of the expected file
    testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
    """
    inputFilename = os.path.join("testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv")
    outputFilename = os.path.join("out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
    expectedFilename = os.path.join("testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv")

    tsvFileSorter = TsvFileSorter(inputFilename)
    func = lambda val: ((val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
    tsvFileSorter.sortFile(outputFilename, func, 3)

    self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

    # Read both files through context managers so the handles are closed deterministically.
    with open(outputFilename, "r") as guessFile:
        guessmd5 = hashlib.md5(guessFile.read()).hexdigest()
    with open(expectedFilename, "r") as gtFile:
        gtmd5 = hashlib.md5(gtFile.read()).hexdigest()

    # assertEqual reports both digests on failure, unlike assertTrue(a == b).
    self.assertEqual(guessmd5, gtmd5)
def testSortFileWithSpaces(self):
    """
    Sorts a file whose header names contain spaces and checks that an output file appears on disk.
    """
    srcParts = ("testdata", "small_cosmic_with_gp_and_gpp", "small_cosmic_trimmed_for_sorting.txt.tbi.byAA")
    srcPath = os.path.join(*srcParts)
    dstPath = os.path.join("out", "small_cosmic_trimmed_for_sorting.txt.byAA.sorted.tsv")

    def sortKey(row):
        # Lower-cased gene name first, then the numeric start and end positions.
        return (row["Gene name"].lower(), int(row["startAA"]), int(row["endAA"]))

    sorter = TsvFileSorter(srcPath)
    sorter.sortFile(dstPath, sortKey)

    self.assertTrue(os.path.exists(dstPath), "No file was generated.")
def testSortFile(self):
    """
    Sorts a file on the filesystem and checks that an output file appears on disk.
    """
    srcParts = ("testdata", "small_cosmic_gpp", "small_cosmic_gpp.tempForSorting.tsv")
    srcPath = os.path.join(*srcParts)
    dstPath = os.path.join("out", "small_cosmic_gpp.tempForSorting.out.tsv")

    def sortKey(row):
        # Lower-cased gene name first, then the numeric start and end positions.
        return (row["Gene_name"].lower(), int(row["startAA"]), int(row["endAA"]))

    sorter = TsvFileSorter(srcPath)
    sorter.sortFile(dstPath, sortKey)

    self.assertTrue(os.path.exists(dstPath), "No file was generated.")
def testCallbackExceptionIncorrectType(self):
    """
    Tests that the CallbackException is raised when the input anonymous function does not return a tuple
    given a row.

    The test fails if no CallbackException is raised at all, or if the exception's ``value`` differs from
    the expected message.
    """
    inputFilename = os.path.join(
        *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
    outputFilename = os.path.join(
        "out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
    tsvFileSorter = TsvFileSorter(inputFilename)

    # Deliberately return a bare string (not a tuple) to provoke the error.
    func = lambda val: (val["Gene name"]).lower()

    expectedMsg = ("The value returned by the callback must be a tuple. Instead, a value of "
                   "<type 'str'> was returned.")
    try:
        tsvFileSorter.sortFile(outputFilename, func, 3)
    except CallbackException as err:
        self.assertEqual(err.value, expectedMsg, "Error msg is different.")
    else:
        # Without this branch the test would pass silently when nothing is raised.
        self.fail("CallbackException was not raised for a callback that does not return a tuple.")
def testSortMixedCaps(self):
    """
    Tests sorting a file with mixed capitalization in the reference column.

    The sorted output must exist and its md5 must equal that of the expected file
    testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
    """
    inputFilename = os.path.join(
        *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
    outputFilename = os.path.join("out", "sort_mixed_caps.tsv.sorted.out.tsv")
    expectedFilename = os.path.join(
        *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv"])

    tsvFileSorter = TsvFileSorter(inputFilename)
    func = lambda val: (
        (val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
    tsvFileSorter.sortFile(outputFilename, func)

    self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

    # Read both files through context managers so the handles are closed deterministically.
    with open(outputFilename, 'r') as guessFile:
        guessmd5 = hashlib.md5(guessFile.read()).hexdigest()
    with open(expectedFilename, "r") as gtFile:
        gtmd5 = hashlib.md5(gtFile.read()).hexdigest()

    # assertEqual reports both digests on failure, unlike assertTrue(a == b).
    self.assertEqual(guessmd5, gtmd5)
def testMultiplePartitionSorting(self):
    """
    Tests that the sorting works when the partition size is small and input file must be broken into
    multiple partitions.

    The sorted output must exist and its md5 must equal that of the expected file
    testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
    """
    inputFilename = os.path.join(
        *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
    outputFilename = os.path.join(
        "out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
    expectedFilename = os.path.join(
        *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv"])

    tsvFileSorter = TsvFileSorter(inputFilename)
    func = lambda val: (
        (val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
    tsvFileSorter.sortFile(outputFilename, func, 3)

    self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

    # Read both files through context managers so the handles are closed deterministically.
    with open(outputFilename, "r") as guessFile:
        guessmd5 = hashlib.md5(guessFile.read()).hexdigest()
    with open(expectedFilename, "r") as gtFile:
        gtmd5 = hashlib.md5(gtFile.read()).hexdigest()

    # assertEqual reports both digests on failure, unlike assertTrue(a == b).
    self.assertEqual(guessmd5, gtmd5)
continue row['startAA'] = feature[1] row['endAA'] = feature[2] row['gene'] = m['gene'] row[annotation] = feature[3] tsvWriter.writerow(row) print("Could not get uniprot seq for " + str(numTranscriptsNotInUniprot) + " transcripts.") print("Attempted " + str(ctr) + " muts") print("Creating tabix index") print("Creating copy of tsv file (" + output_file + ") ...") tabixBasedFilename = output_file + ".copy.tsv" shutil.copyfile(output_file, tabixBasedFilename) print("Sorting ...") tsvFileSorter = TsvFileSorter(fieldNames=['gene','startAA', 'endAA']) tsvFileSorter.sortFile(tabixBasedFilename, tabixBasedFilename + ".sorted") print("Creating actual index ...") # swiss_data[key].features # For each feature, position 0 is name. # Look for "SITE" (site), "VARIANT" (natural_variation), # "COMPBIAS" or "REGION" or "DOMAIN"? (region) # create a line for each entry # Then add trembl, but only if swiss_prot has not already covered it # # Verify with old oncotator code? pass
def indexGeneProteinPosition(geneColumn, proteinInfoColumn, inputFilename, outputFilename):
    """
    Creates an intermediate temporary file that includes two additional columns, startAA and endAA,
    sorts the file, writes the sorted file to outputFilename, and then indexes the sorted file.

    Rows whose protein column is empty, or for which no start/end position could be extracted, are
    dropped from the intermediate file.

    :param geneColumn: name of the gene column in the inputFilename
    :param proteinInfoColumn: name of the protein change or position column. Can be of formats:
        p.K128_R130del (position 128 through 130)
        For more examples, see MutUtilsTest.testProteinChange()
    :param inputFilename: input tsv filename
    :param outputFilename: output filename
    :return: the value returned by TabixIndexer.index for the sorted file
    """
    startAACol = "startAA"
    endAACol = "endAA"

    # Create intermediate file. Do not use '#' for comments, since header can start with '#'
    tsvReader = GenericTsvReader(inputFilename, commentPrepend=";")

    # These are the outputHeaders for the intermediate file.
    headers = tsvReader.getFieldNames()
    if startAACol not in headers:
        headers += [startAACol]
    if endAACol not in headers:
        headers += [endAACol]

    # Write to the intermediate temporary file.
    # No dir is given, so tempfile places it in its default temporary directory.
    temp = tempfile.NamedTemporaryFile()
    try:
        with open(temp.name, 'w') as csvfile:
            # Initialize the intermediate file's header.
            tsvWriter = csv.DictWriter(csvfile, headers, delimiter='\t', lineterminator='\n')

            # If a header starts with '#', strip the '#' characters from it. The list is edited in
            # place, so the writer (which was handed this same list) sees the cleaned names.
            for i, header in enumerate(headers):
                if header.startswith("#"):
                    headers[i] = header.replace("#", "")
            tsvWriter.writeheader()

            # Get indices of relevant columns.
            gene_i = headers.index(geneColumn)
            startAA_i = headers.index(startAACol)
            endAA_i = headers.index(endAACol)

            # Write each line of the intermediate file.
            for row in tsvReader:
                protein = row[proteinInfoColumn]
                if protein is None or not protein.strip():
                    continue

                [startAA, endAA] = MutUtils.extractProteinPosition(protein)
                if not startAA.strip() or not endAA.strip():
                    continue

                row[startAACol] = startAA
                row[endAACol] = endAA
                tsvWriter.writerow(row)
        # Leaving the with-block flushes and closes the intermediate file, even on error.

        # Sort the intermediate tsv file. The key must use the caller-supplied gene column (the same
        # column handed to the indexer below), not a hard-coded header name.
        tsvSorter = TsvFileSorter(temp.name)
        func = lambda val: ((val[geneColumn]).lower(), int(val["startAA"]), int(val["endAA"]))

        # Use the whole file path name.
        outputFilename = os.path.abspath(outputFilename)
        tsvSorter.sortFile(outputFilename, func)
    finally:
        # Closing the NamedTemporaryFile removes the intermediate file; do it explicitly rather than
        # waiting for garbage collection.
        temp.close()

    return TabixIndexer.index(destDir=os.path.dirname(os.path.abspath(outputFilename)),
                              inputFilename=outputFilename,
                              fileColumnNumList=[gene_i, startAA_i, endAA_i])