def do_work(self):
    """Main wrapper function for running public MAF filter.

    Streams records from ``options['input_maf']`` and writes to
    ``options['output_maf']`` those that are Somatic, supported by at
    least ``options['min_callers']`` callers, and whose GDC filters are
    either empty or (for hotspots) limited to a whitelisted set.
    Prints the collected metrics as JSON to stdout when finished.
    """
    self.logger.info("Processing input maf {0}...".format(
        self.options["input_maf"]))

    # Reader
    self.maf_reader = MafReader.reader_from(
        path=self.options['input_maf'],
        validation_stringency=ValidationStringency.Strict)

    try:
        # Header
        self.setup_maf_header()

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)
    except Exception:
        # Don't leak the open reader if header/writer setup fails;
        # the finally-block below is not reached in that case.
        self.maf_reader.close()
        raise

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Counts
    processed = 0
    # GDC filters that are tolerated on a hotspot record.
    hotspot_gdc_set = set(['gdc_pon', 'common_in_exac'])

    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))

            callers = record['callers'].value
            if len(callers) >= self.options['min_callers'] and \
                    record['Mutation_Status'].value.value == 'Somatic':
                self.metrics.add_sample_swap_metric(record)
                gdc_filters = record['GDC_FILTER'].value
                gfset = set(gdc_filters)
                if self.is_hotspot(record):
                    # Hotspots pass if only whitelisted filters remain.
                    if len(gfset - hotspot_gdc_set) == 0:
                        self.write_record(record)
                elif not gfset:
                    # Non-hotspots pass only with no GDC filters at all.
                    self.write_record(record)

            processed += 1
            self.metrics.input_records += 1

        self.logger.info("Processed {0} records.".format(processed))
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()
def load_readers(self):
    """ Loads the array of MafReaders and sets the callers list. """
    for caller in ('mutect2', 'muse', 'vardict', 'varscan2',
                   'somaticsniper', 'pindel'):
        maf_path = self.options[caller]
        if not maf_path:
            # Caller not supplied on the command line; skip it.
            continue
        self.logger.info("{0} MAF {1}".format(caller, maf_path))
        self.maf_readers.append(
            MafReader.reader_from(
                path=maf_path,
                validation_stringency=ValidationStringency.Strict))
        self.callers.append(caller)
def __main__(cls, options):
    """The main method.

    Validates each MAF in ``options.input``, reporting up to
    ``options.max_errors`` errors to stdout or ``options.output``.
    The output handle and each reader are closed even on error.
    """
    logger = Logger.get_logger(cls.__name__)

    if options.output is None:
        handle = sys.stdout
    else:
        handle = open(options.output, "w")

    try:
        errors = ValidationErrors(
            options.max_errors if options.max_errors else sys.maxsize)

        for path in options.input:
            logger.info("Examining %s", path)
            # Gather as many errors as possible
            silent = ValidationStringency.Silent
            reader = MafReader.reader_from(path=path,
                                           validation_stringency=silent,
                                           scheme=options.scheme)
            try:
                # Header errors first; stop early if the error budget
                # is already exhausted.
                if not cls.__process_errors(options, reader, logger,
                                            handle, errors):
                    n = 0
                    for _ in reader:
                        if cls.__process_errors(options, reader, logger,
                                                handle, errors):
                            break
                        n = n + 1
                        if n % Validate.print_every_n_records == 0:
                            logger.info("Processed %d records" % n)
                    if n == 0 or n % Validate.print_every_n_records != 0:
                        logger.info("Processed %d records" % n)
            finally:
                # Close each reader even if validation raised.
                reader.close()

        cls.__print_report(options, errors, handle)
    finally:
        # Only close handles we opened; never close sys.stdout.
        if options.output:
            handle.close()
def test_with_sorting(self):
    """Records added out of coordinate order are written sorted when
    ``assume_sorted=False`` and the header declares a sort order."""
    import os

    scheme = TestMafWriter.TestCoordinateScheme()
    fd, path = tempfile.mkstemp()
    # mkstemp returns an open OS-level descriptor; close it so the
    # writer below is the only open handle on the file (fd leak fix).
    os.close(fd)

    # Create the header
    header_lines = (MafHeader.scheme_header_lines(scheme) +
                    ["#key1 value1", "#key2 value2"] + [
                        "%s%s %s" % (
                            MafHeader.HeaderLineStartSymbol,
                            MafHeader.SortOrderKey,
                            Coordinate().name(),
                        )
                    ] + ["\t".join(scheme.column_names())])
    header = MafHeader.from_lines(
        lines=header_lines,
        validation_stringency=ValidationStringency.Silent)

    # Write the header, and the record twice
    writer = MafWriter.from_path(
        header=header,
        validation_stringency=ValidationStringency.Lenient,
        path=path,
        assume_sorted=False,
    )
    writer += TestMafWriter.DummyRecord("chr1", 2, 2)
    writer += TestMafWriter.DummyRecord("chr1", 3, 3)
    writer += TestMafWriter.DummyRecord("chr1", 4, 4)
    writer.close()

    reader = MafReader.reader_from(path=path, scheme=scheme)
    header = reader.header()
    records = [rec for rec in reader]
    reader.close()
    # Clean up the temp file once it has been fully read back.
    os.remove(path)

    self.assertEqual(header.sort_order().name(), Coordinate.name())
    self.assertListEqual([r["Start_Position"].value for r in records],
                         [2, 3, 4])
    self.assertListEqual([r["End_Position"].value for r in records],
                         [2, 3, 4])
def test_reader_out_of_order(self):
    """Iterating a coordinate-sorted MAF whose records are out of
    order must raise a ``ValueError``."""
    column_names = ["Chromosome", "Start_Position", "End_Position"]
    scheme = NoRestrictionsScheme(column_names)
    prefix = MafHeader.HeaderLineStartSymbol
    lines = [
        "%s%s %s" % (prefix, MafHeader.VersionKey, scheme.version()),
        "%s%s %s" % (prefix, MafHeader.SortOrderKey, Coordinate()),
        "\t".join(column_names),
        "A\t1\t1",
        "A\t4\t4",
        "A\t2\t2",
    ]
    fh, fn = tmp_file(lines=lines)
    fh.close()

    reader = MafReader.reader_from(
        path=fn,
        validation_stringency=ValidationStringency.Silent,
        scheme=scheme)
    self.assertEqual(reader.scheme().version(), scheme.version())
    self.assertEqual(reader.header().version(), scheme.version())
    self.assertEqual(reader.header().sort_order().name(),
                     Coordinate().name())
    # The violation is detected lazily, while consuming records.
    with self.assertRaises(ValueError):
        list(reader)
    reader.close()
def __main__(cls, options):
    """The main method.

    Reads ``options.input``, sorts its records with a ``MafSorter``
    (spilling to disk beyond 100000 records in RAM), and writes the
    sorted records out. Resources are closed even if sorting fails.
    """
    logger = Logger.get_logger(cls.__name__)

    reader = MafReader.reader_from(
        path=options.input,
        validation_stringency=options.validation_stringency,
        scheme=options.scheme)
    writer = writer_from_reader(reader=reader, options=options)
    sorter = MafSorter(max_objects_in_ram=100000,
                       sort_order_name=options.sort_order,
                       scheme=writer.header().scheme(),
                       fasta_index=options.fasta_index)

    try:
        # add the records to the sorter
        n = 0
        for record in reader:
            sorter += record
            n = n + 1
            # Progress logging only when writing to a file, so stdout
            # output is not interleaved with log lines.
            if options.output and n % Sort.print_every_n_records == 0:
                logger.info("Sorted %d records" % n)
        if options.output and (n == 0 or
                               n % Sort.print_every_n_records != 0):
            logger.info("Sorted %d records" % n)

        # read from the sorter
        n = 0
        for record in sorter:
            writer += record
            n = n + 1
            if options.output and n % Sort.print_every_n_records == 0:
                logger.info("Wrote %d records" % n)
        if options.output and (n == 0 or
                               n % Sort.print_every_n_records != 0):
            logger.info("Wrote %d records" % n)
    finally:
        # Close everything even if iteration raised (fd/temp-file leak fix).
        sorter.close()
        reader.close()
        writer.close()
def __main__(cls, options):
    """The main method.

    Copies every record from ``options.input`` to the writer built by
    ``writer_from_reader``, logging progress every
    ``View.print_every_n_records`` records when writing to a file.
    Reader and writer are closed even if iteration fails.
    """
    logger = Logger.get_logger(cls.__name__)

    reader = MafReader.reader_from(
        path=options.input,
        validation_stringency=options.validation_stringency,
        scheme=options.scheme)
    writer = writer_from_reader(reader=reader, options=options)

    try:
        n = 0
        for record in reader:
            writer += record
            n = n + 1
            # Progress logging only when writing to a file, so stdout
            # output is not interleaved with log lines.
            if options.output and n % View.print_every_n_records == 0:
                logger.info("Processed %d records" % n)
        if options.output and (n == 0 or
                               n % View.print_every_n_records != 0):
            logger.info("Processed %d records" % n)
    finally:
        # Close both ends even if a record failed validation (leak fix).
        reader.close()
        writer.close()
def test_reader_from_with_scheme(self):
    """A reader built with an explicit scheme reports that scheme's
    version and parses column values with the scheme's types."""
    scheme = TestMafReader.TestScheme()
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        scheme.version(),
    )
    rows = [
        ["cell-1-1", "1.314", "cell-1-2"],
        ["cell-2-1", "2.314", "cell-2-2"],
        ["cell-3-1", "3.314", "cell-3-2"],
    ]
    lines = [version_line, "\t".join(scheme.column_names())]
    lines.extend("\t".join(row) for row in rows)
    fh, fn = tmp_file(lines=lines)
    fh.close()

    reader = MafReader.reader_from(
        path=fn,
        validation_stringency=ValidationStringency.Silent,
        scheme=scheme)
    records = list(reader)

    self.assertEqual(reader.scheme().version(), scheme.version())
    self.assertEqual(reader.header().version(), scheme.version())
    self.assertEqual(len(reader.header()), 1)
    self.assertEqual(len(records), 3)
    self.assertListEqual([r["str1"].value for r in records],
                         ["cell-1-1", "cell-2-1", "cell-3-1"])
    # "float" column is typed by the scheme, so values come back as floats.
    self.assertListEqual([r["float"].value for r in records],
                         [1.314, 2.314, 3.314])
    reader.close()
def do_work(self):
    """Main wrapper function for running public MAF filter"""
    self.logger.info("Processing input maf {0}...".format(
        self.options["input_maf"]))

    # Reader
    self.maf_reader = MafReader.reader_from(
        path=self.options["input_maf"],
        validation_stringency=ValidationStringency.Strict,
    )

    # Header
    self.setup_maf_header()

    # Writer
    self.maf_writer = MafWriter.from_path(
        path=self.options["output_maf"],
        header=self.maf_header,
        validation_stringency=ValidationStringency.Strict,
    )

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Counts
    processed = 0
    # GDC filters tolerated on a hotspot record.
    hotspot_gdc_set = set(["gdc_pon", "common_in_gnomAD"])
    # Filter that may be rescued when the record looks like a splice
    # variant (see is_splice, defined elsewhere on this class).
    nonexonic_set = set(["NonExonic"])

    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))
            callers = record["callers"].value
            # Only Somatic records with enough caller support are
            # considered for output at all.
            if (len(callers) >= self.options["min_callers"] and
                    record["Mutation_Status"].value.value == "Somatic"):
                self.metrics.add_sample_swap_metric(record)
                gdc_filters = record["GDC_FILTER"].value
                gfset = set(gdc_filters)
                if self.is_hotspot(record):
                    # Filters remaining after removing the hotspot whitelist.
                    other_filts = gfset - hotspot_gdc_set
                    if len(other_filts) == 0:
                        self.write_record(record)
                    elif len(other_filts - nonexonic_set
                             ) == 0 and self.is_splice(record):
                        # Rescue splicing if NonExonic
                        self.write_record(record)
                # Rescue splicing if NonExonic
                elif len(gfset - nonexonic_set) == 0 and self.is_splice(record):
                    self.write_record(record)
                elif not gfset:
                    # Non-hotspot, non-splice records pass only with no
                    # GDC filters at all.
                    self.write_record(record)
            processed += 1
            self.metrics.input_records += 1
        self.logger.info("Processed {0} records.".format(processed))
        # Metrics report goes to stdout as pretty-printed JSON.
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()