def test_add_impact_1(self):
    """
    Test that a maf file with is_in_IMPACT column comes out as expected

    Runs the CWL on the Proj_08390_G ``Sample1.Sample2.muts.maf`` file
    together with the IMPACT gene list file, then checks:

    - the CWL output record (location, checksum, size),
    - the total number of lines in the output file,
    - how many rows have ``is_in_impact`` set to 'True' and to 'False',
    - that output rows carry two more columns than input rows.
    """
    self.maxDiff = None
    input_maf = os.path.join(DATA_SETS['Proj_08390_G']['MAF_DIR'],
                             "Sample1.Sample2.muts.maf")

    with TemporaryDirectory() as tmpdir:
        input_json = {
            "input_file": {
                "class": "File",
                "path": input_maf
            },
            "output_filename": 'output.maf',
            "IMPACT_file": {
                "class": "File",
                "path": IMPACT_FILE
            },
        }

        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=input_json,
                                          cwl_file=cwl_file)

        expected_output = {
            'IMPACT_col_added_file': {
                'location': 'file://' + os.path.join(output_dir, 'output.maf'),
                'basename': 'output.maf',
                'class': 'File',
                'checksum': 'sha1$1397fade2f877c2bcfca791407e328c5c48e6ff0',
                'size': 15629589,
                'path': os.path.join(output_dir, 'output.maf')
            }
        }
        self.assertDictEqual(output_json, expected_output)

        # validate output mutation file contents
        output_maf = output_json['IMPACT_col_added_file']['path']
        with open(output_maf) as fin:
            # count lines lazily; no need to hold the whole file in memory
            output_maf_lines = sum(1 for _ in fin)
        self.assertEqual(output_maf_lines, 12518)

        # the comment lines returned by load_mutations are not needed here
        _, input_mutations = load_mutations(input_maf)
        _, output_mutations = load_mutations(output_maf)

        # assertEqual (rather than assertTrue(a == b)) reports both values
        # when the counts do not match
        impact_flags = [row['is_in_impact'] for row in output_mutations]
        self.assertEqual(impact_flags.count('True'), 8367)
        self.assertEqual(impact_flags.count('False'), 4147)

        # check that it has two extra columns in the output
        self.assertEqual(len(output_mutations[1]),
                         len(input_mutations[1]) + 2)
def test_add_af(self):
    """
    Test the CWL that adds a t_af column, using a tiny hand-written maf

    The input has two comment lines, a header with Hugo_Symbol, t_depth
    and t_alt_count, and three rows. The expected t_af values
    (0.75, 0.01, 0.0) correspond to t_alt_count / t_depth for each row.
    """
    comment_rows = [['# comment 1'], ['# comment 2']]
    header_row = ['Hugo_Symbol', 't_depth', 't_alt_count']
    data_rows = [
        ['SUFU', '100', '75'],
        ['GOT1', '100', '1'],
        ['SOX9', '100', '0'],
    ]
    table = comment_rows + [header_row] + data_rows

    with TemporaryDirectory() as tmpdir:
        maf_path = write_table(tmpdir=tmpdir,
                               filename='input.maf',
                               lines=table)
        cwl_input = {
            "input_file": {
                "class": "File",
                "path": maf_path
            },
            "output_filename": 'output.maf',
        }

        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=cwl_input,
                                          cwl_file=cwl_file)

        # the CWL output record should describe output.maf in output_dir
        result_path = os.path.join(output_dir, 'output.maf')
        self.assertDictEqual(output_json, {
            'output_file': {
                'location': 'file://' + result_path,
                'basename': 'output.maf',
                'class': 'File',
                'checksum': 'sha1$39de59ad5d736db692504012ce86d3395685112e',
                'size': 109,
                'path': result_path
            }
        })

        comments, mutations = load_mutations(
            output_json['output_file']['path'])

        # comment lines must be carried over untouched
        self.assertEqual(comments, ['# comment 1', '# comment 2'])

        # every input row is kept and gains a t_af value
        wanted_rows = [
            {
                'Hugo_Symbol': gene,
                't_depth': depth,
                't_alt_count': alt_count,
                't_af': allele_fraction
            }
            for gene, depth, alt_count, allele_fraction in (
                ('SUFU', '100', '75', '0.75'),
                ('GOT1', '100', '1', '0.01'),
                ('SOX9', '100', '0', '0.0'),
            )
        ]
        self.assertEqual(mutations, wanted_rows)
def test_add_impact_0(self):
    """
    Test the IMPACT CWL with a tiny hand-written dataset

    The maf holds three genes (SUFU, GOT1, BRCA). The IMPACT file lists
    SUFU under two assays and BRCA under one; GOT1 is not listed. The
    output is expected to gain is_in_impact and impact_assays columns.
    """
    genes = ['SUFU', 'GOT1', 'BRCA']
    maf_table = [['# comment 1'], ['# comment 2'], ['Hugo_Symbol']]
    maf_table += [[gene] for gene in genes]

    impact_table = [
        ['BRCA', 'IMPACT468'],
        ['SUFU', 'IMPACT468'],
        ['SUFU', 'IMPACT505'],
    ]

    with TemporaryDirectory() as tmpdir:
        maf_path = write_table(tmpdir=tmpdir,
                               filename='input.maf',
                               lines=maf_table)
        impact_path = write_table(tmpdir=tmpdir,
                                  filename='impact.txt',
                                  lines=impact_table)
        cwl_input = {
            "input_file": {
                "class": "File",
                "path": maf_path
            },
            "output_filename": 'output.maf',
            "IMPACT_file": {
                "class": "File",
                "path": impact_path
            },
        }

        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=cwl_input,
                                          cwl_file=cwl_file)

        # the CWL output record should describe output.maf in output_dir
        result_path = os.path.join(output_dir, 'output.maf')
        self.assertDictEqual(output_json, {
            'IMPACT_col_added_file': {
                'location': 'file://' + result_path,
                'basename': 'output.maf',
                'class': 'File',
                'checksum': 'sha1$5c61f3977dad29ebc74966e8fc40a0278f9aab12',
                'size': 126,
                'path': result_path
            }
        })

        comments, mutations = load_mutations(
            output_json['IMPACT_col_added_file']['path'])

        # comment lines must be carried over untouched
        self.assertEqual(comments, ['# comment 1', '# comment 2'])

        # rows keep the input order; genes absent from the IMPACT file
        # get 'False' and a '.' placeholder
        wanted_rows = [
            {
                'Hugo_Symbol': gene,
                'is_in_impact': flag,
                'impact_assays': assays
            }
            for gene, flag, assays in (
                ('SUFU', 'True', 'IMPACT468,IMPACT505'),
                ('GOT1', 'False', '.'),
                ('BRCA', 'True', 'IMPACT468'),
            )
        ]
        self.assertEqual(mutations, wanted_rows)
def test_run_worflow_mixed_mafs(self):
    """
    Test that the workflow works correctly when run with a mix of Argos
    muts.maf files and Facets Suite annotated maf files

    The Facets Suite maf files have extra columns that need to be retained
    in the output. Besides the analysis directory listing, this test checks
    that a set of ASCN.* columns is present in the output muts.maf and
    spot-checks the t_af, is_in_impact and impact_assays values of the
    first row.
    """
    dataset = DATA_SETS['Proj_08390_G']
    maf_dir = dataset['MAF_DIR']
    facets_dir = dataset['FACETS_DIR']

    def file_record(directory, filename):
        # CWL File input record for a file inside one of the dataset dirs
        return {"path": os.path.join(directory, filename), "class": "File"}

    input_json = {
        "is_impact": True,
        "argos_version_string": "2.x",
        "analysis_gene_cna_filename": "Proj_08390_G.gene.cna.txt",
        "analysis_mutations_filename": "Proj_08390_G.muts.maf",
        "analysis_mutations_share_filename": "Proj_08390_G.muts.share.maf",
        "analysis_segment_cna_filename": "Proj_08390_G.seg.cna.txt",
        "analysis_sv_filename": "Proj_08390_G.svs.maf",
        "helix_filter_version": "20.06.1",
        "IMPACT_gene_list": {
            "class": "File",
            "path": IMPACT_FILE
        },
        "targets_list": {
            "path": dataset["targets_list"],
            "class": "File"
        },
        # one Argos maf plus one Facets Suite annotated maf
        "mutation_maf_files": [
            file_record(maf_dir, "Sample1.Sample2.muts.maf"),
            file_record(dataset['FACETS_SUITE_DIR'],
                        "Sample4.Sample3_hisens.ccf.portal.maf"),
        ],
        "mutation_svs_maf_files": [
            file_record(maf_dir, "Sample1.Sample2.svs.pass.vep.maf"),
            file_record(maf_dir, "Sample4.Sample3.svs.pass.vep.maf"),
        ],
        "facets_hisens_cncf_files": [
            file_record(
                facets_dir,
                "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.cncf.txt"
            ),
            file_record(
                facets_dir,
                "Sample3.rg.md.abra.printreads__Sample4.rg.md.abra.printreads_hisens.cncf.txt"
            ),
        ],
        "facets_hisens_seg_files": [
            file_record(
                facets_dir,
                "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.seg"
            ),
            file_record(
                facets_dir,
                "Sample3.rg.md.abra.printreads__Sample4.rg.md.abra.printreads_hisens.seg"
            ),
        ]
    }

    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=input_json,
                                          cwl_file=cwl_file)

        def expected_file(basename, checksum, size):
            # expected CWL File record for an entry of the analysis dir
            path = os.path.join(output_dir, 'analysis/' + basename)
            return {
                'location': 'file://' + path,
                'basename': basename,
                'class': 'File',
                'checksum': checksum,
                'size': size,
                'path': path
            }

        expected_output = {
            'analysis_dir': {
                'class': 'Directory',
                'basename': 'analysis',
                'location': 'file://' + os.path.join(output_dir, 'analysis'),
                'path': os.path.join(output_dir, 'analysis'),
                'listing': [
                    expected_file(
                        'Proj_08390_G.gene.cna.txt',
                        'sha1$ab17d587ad5ae0a87fd6c6d4dd2d5d1701208ce9',
                        173982),
                    expected_file(
                        'Proj_08390_G.muts.maf',
                        'sha1$66a87cb8cc2eea31f490852d468bedd958c4ecc5',
                        59915),
                    expected_file(
                        'Proj_08390_G.muts.share.maf',
                        'sha1$cbaa23bb848978cde135efd3870db8f35b3f2861',
                        10729),
                    expected_file(
                        'Proj_08390_G.seg.cna.txt',
                        'sha1$f6a77b280c047a7e2082e3a09e8138f861790d3a',
                        3191),
                    expected_file(
                        'Proj_08390_G.svs.maf',
                        'sha1$5c2a63fc01980550108e58079a8b689d53c97d8c',
                        35595),
                ]
            }
        }
        self.maxDiff = None
        self.assertDictEqual(output_json, expected_output)

        # the comment lines returned by load_mutations are not needed here
        _, mutations = load_mutations(
            os.path.join(output_dir, 'analysis', 'Proj_08390_G.muts.maf'))
        self.assertEqual(len(mutations), 34)

        # columns contributed by the Facets Suite maf must survive
        colnames = mutations[0].keys()
        some_required_colnames = [
            "ASCN.TOTAL_COPY_NUMBER",
            "ASCN.MINOR_COPY_NUMBER",
            "ASCN.EXPECTED_ALT_COPIES",
            "ASCN.CCF_EXPECTED_COPIES",
            "ASCN.CCF_EXPECTED_COPIES_LOWER",
            "ASCN.CCF_EXPECTED_COPIES_UPPER",
            "ASCN.ASCN_METHOD",
            "ASCN.ASCN_INTEGER_COPY_NUMBER",
        ]
        # the full list of missing columns is loop-invariant: build it once
        missing_colnames = [
            c for c in some_required_colnames if c not in colnames
        ]
        for colname in some_required_colnames:
            # assertIn reports the member and container on failure
            self.assertIn(
                colname, colnames,
                "Column label {} not present in the mutation file. Missing columns: {}"
                .format(colname, missing_colnames))

        # spot-check the annotation columns on the first row
        first_row = mutations[0]
        self.assertEqual(first_row['t_af'], '0.42953020134228187')
        self.assertEqual(first_row['is_in_impact'], 'True')
        self.assertEqual(first_row['impact_assays'],
                         'IMPACT341,IMPACT410,IMPACT468,IMPACT505')
def test_run_worflow_one_maf(self):
    """
    Test that the workflow works correctly when run with a single maf

    One mutation maf, one SV maf and one pair of Facets hisens files are
    passed in; the analysis directory listing and the number of rows in
    the output muts.maf (22) are checked.
    """
    project = DATA_SETS['Proj_08390_G']

    def project_file(dir_key, filename):
        # CWL File input record for a file in one of the project dirs
        return {
            "path": os.path.join(project[dir_key], filename),
            "class": "File"
        }

    input_json = {
        "is_impact": True,
        "argos_version_string": "2.x",
        "analysis_gene_cna_filename": "Proj_08390_G.gene.cna.txt",
        "analysis_mutations_filename": "Proj_08390_G.muts.maf",
        "analysis_mutations_share_filename": "Proj_08390_G.muts.share.maf",
        "analysis_segment_cna_filename": "Proj_08390_G.seg.cna.txt",
        "analysis_sv_filename": "Proj_08390_G.svs.maf",
        "helix_filter_version": "20.06.1",
        "IMPACT_gene_list": {
            "class": "File",
            "path": IMPACT_FILE
        },
        "targets_list": {
            "path": project["targets_list"],
            "class": "File"
        },
        "mutation_maf_files": [
            project_file('MAF_DIR', "Sample1.Sample2.muts.maf"),
        ],
        "mutation_svs_maf_files": [
            project_file('MAF_DIR', "Sample1.Sample2.svs.pass.vep.maf"),
        ],
        "facets_hisens_cncf_files": [
            project_file(
                'FACETS_DIR',
                "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.cncf.txt"
            ),
        ],
        "facets_hisens_seg_files": [
            project_file(
                'FACETS_DIR',
                "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.seg"
            ),
        ],
    }

    # (basename, checksum, size) of each expected analysis file, in
    # listing order
    analysis_files = (
        ('Proj_08390_G.gene.cna.txt',
         'sha1$7cc89d24556de93b9a409812317581e67e5df494', 87905),
        ('Proj_08390_G.muts.maf',
         'sha1$2c8904927a917d6e935ef207582d995680574d16', 33243),
        ('Proj_08390_G.muts.share.maf',
         'sha1$b5af4e0fcd89fecabf8095aa3d7690e5edb8dca1', 7462),
        ('Proj_08390_G.seg.cna.txt',
         'sha1$f0ebb82c34b6530447fa1e70b6dedcc039840d61', 1632),
        ('Proj_08390_G.svs.maf',
         'sha1$df420706bb5b772a79317843c0a01a3c88a9571d', 23603),
    )

    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=input_json,
                                          cwl_file=cwl_file)

        analysis_path = os.path.join(output_dir, 'analysis')
        listing = []
        for basename, checksum, size in analysis_files:
            file_path = os.path.join(output_dir, 'analysis/' + basename)
            listing.append({
                'location': 'file://' + file_path,
                'basename': basename,
                'class': 'File',
                'checksum': checksum,
                'size': size,
                'path': file_path
            })

        expected_output = {
            'analysis_dir': {
                'class': 'Directory',
                'basename': 'analysis',
                'location': 'file://' + analysis_path,
                'path': analysis_path,
                'listing': listing
            }
        }
        self.maxDiff = None
        self.assertDictEqual(output_json, expected_output)

        # the merged mutation file should hold 22 rows
        _comments, mutations = load_mutations(
            os.path.join(output_dir, 'analysis', 'Proj_08390_G.muts.maf'))
        self.assertEqual(len(mutations), 22)
def test_run_worflow_two_mafs(self):
    """
    Test that the workflow works correctly when run with two maf files

    Two mutation mafs, two SV mafs and two pairs of Facets hisens files
    are passed in; the analysis directory listing and the number of rows
    in the output muts.maf (34) are checked.
    """
    maf_dir = DATA_SETS['Proj_08390_G']['MAF_DIR']
    facets_dir = DATA_SETS['Proj_08390_G']['FACETS_DIR']

    def files_in(directory, filenames):
        # CWL File input records for several files of one directory
        return [{
            "path": os.path.join(directory, filename),
            "class": "File"
        } for filename in filenames]

    input_json = {
        "is_impact": True,
        "argos_version_string": "2.x",
        "analysis_gene_cna_filename": "Proj_08390_G.gene.cna.txt",
        "analysis_mutations_filename": "Proj_08390_G.muts.maf",
        "analysis_mutations_share_filename": "Proj_08390_G.muts.share.maf",
        "analysis_segment_cna_filename": "Proj_08390_G.seg.cna.txt",
        "analysis_sv_filename": "Proj_08390_G.svs.maf",
        "helix_filter_version": "20.06.1",
        "IMPACT_gene_list": {
            "class": "File",
            "path": IMPACT_FILE
        },
        "targets_list": {
            "path": DATA_SETS['Proj_08390_G']["targets_list"],
            "class": "File"
        },
        "mutation_maf_files": files_in(maf_dir, [
            "Sample1.Sample2.muts.maf",
            "Sample4.Sample3.muts.maf",
        ]),
        "mutation_svs_maf_files": files_in(maf_dir, [
            "Sample1.Sample2.svs.pass.vep.maf",
            "Sample4.Sample3.svs.pass.vep.maf",
        ]),
        "facets_hisens_cncf_files": files_in(facets_dir, [
            "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.cncf.txt",
            "Sample3.rg.md.abra.printreads__Sample4.rg.md.abra.printreads_hisens.cncf.txt",
        ]),
        "facets_hisens_seg_files": files_in(facets_dir, [
            "Sample2.rg.md.abra.printreads__Sample1.rg.md.abra.printreads_hisens.seg",
            "Sample3.rg.md.abra.printreads__Sample4.rg.md.abra.printreads_hisens.seg",
        ])
    }

    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(testcase=self,
                                          tmpdir=tmpdir,
                                          input_json=input_json,
                                          cwl_file=cwl_file)

        def analysis_entry(basename, checksum, size):
            # expected CWL File record for an entry of the analysis dir
            entry_path = os.path.join(output_dir, 'analysis/' + basename)
            return {
                'location': 'file://' + entry_path,
                'basename': basename,
                'class': 'File',
                'checksum': checksum,
                'size': size,
                'path': entry_path
            }

        analysis_path = os.path.join(output_dir, 'analysis')
        expected_output = {
            'analysis_dir': {
                'class': 'Directory',
                'basename': 'analysis',
                'location': 'file://' + analysis_path,
                'path': analysis_path,
                'listing': [
                    analysis_entry(
                        'Proj_08390_G.gene.cna.txt',
                        'sha1$ab17d587ad5ae0a87fd6c6d4dd2d5d1701208ce9',
                        173982),
                    analysis_entry(
                        'Proj_08390_G.muts.maf',
                        'sha1$d4352ee2b702877b84db2b632972ccad2441f3e0',
                        54458),
                    analysis_entry(
                        'Proj_08390_G.muts.share.maf',
                        'sha1$086ce6517eae68e47160c8740c5f00d7c3454110',
                        10956),
                    analysis_entry(
                        'Proj_08390_G.seg.cna.txt',
                        'sha1$f6a77b280c047a7e2082e3a09e8138f861790d3a',
                        3191),
                    analysis_entry(
                        'Proj_08390_G.svs.maf',
                        'sha1$5c2a63fc01980550108e58079a8b689d53c97d8c',
                        35595),
                ]
            }
        }
        self.maxDiff = None
        self.assertDictEqual(output_json, expected_output)

        # the merged mutation file should hold 34 rows
        _comments, mutations = load_mutations(
            os.path.join(output_dir, 'analysis', 'Proj_08390_G.muts.maf'))
        self.assertEqual(len(mutations), 34)