def _set_checker_and_sorter(self):
    """Configure either an order checker or an on-the-fly sorter.

    Must be called **after** the scheme has been set."""
    sort_order = self._header.sort_order()
    if not self._assume_sorted and sort_order.sort_key():
        # Records must be sorted by us before being written.
        self._checker = None
        self._sorter = MafSorter(
            sort_order_name=sort_order.name(), scheme=self._scheme)
    else:
        # Caller guarantees order (or there is no sort key); just verify it.
        self._checker = SortOrderChecker(sort_order)
        self._sorter = None
def test_sorter_with_sort_order_args(self):
    """Sorting honors extra sort-order args (the fasta index): records on a
    known contig sort cleanly, while an unknown contig raises ValueError.

    Bug fix: the original list was missing a comma after the ``chr1`` entry,
    so Python's implicit string-literal concatenation silently merged the
    chr1 and chr2 index lines into one malformed entry (9 contigs, not 10).
    Cleanup is also now exception-safe via try/finally.
    """
    # fai-style columns: name, length, offset, bases-per-line, bytes-per-line
    lines = [
        "chr1\t248956422\t112\t70\t71",
        "chr2\t242193529\t252513167\t70\t71",
        "chr3\t198295559\t498166716\t70\t71",
        "chr4\t190214555\t699295181\t70\t71",
        "chr5\t181538259\t892227221\t70\t71",
        "chr6\t170805979\t1076358996\t70\t71",
        "chr7\t159345973\t1249605173\t70\t71",
        "chr8\t145138636\t1411227630\t70\t71",
        "chr9\t138394717\t1558439788\t70\t71",
        "chr10\t133797422\t1698811686\t70\t71",
    ]
    fd, fn = tmp_file(lines=lines)
    try:
        sorter = MafSorter(
            sort_order_name=BarcodesAndCoordinate.name(),
            max_objects_in_ram=100,
            fasta_index=fn,
        )
        self.__test_sorter(sorter=sorter, chromosome="chr5")
        # "1" is not a contig name present in the fasta index above
        with self.assertRaises(ValueError):
            self.__test_sorter(sorter=sorter, chromosome="1")
    finally:
        fd.close()
        os.remove(fn)
def test_sorter_with_scheme(self):
    """An explicitly supplied scheme should be used by the sorter."""
    sorter = MafSorter(
        max_objects_in_ram=100,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=DummyScheme(),
    )
    self.__test_sorter(sorter=sorter, with_scheme=True)
def __main__(cls, options):
    """The main method.

    Reads MAF records from the input, feeds them through a disk-backed
    sorter, and writes them out in sorted order, logging progress every
    ``Sort.print_every_n_records`` records.
    """
    logger = Logger.get_logger(cls.__name__)

    reader = MafReader.reader_from(
        path=options.input,
        validation_stringency=options.validation_stringency,
        scheme=options.scheme,
    )
    writer = writer_from_reader(reader=reader, options=options)
    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=options.sort_order,
        scheme=writer.header().scheme(),
        fasta_index=options.fasta_index,
    )

    # Phase 1: push every input record into the sorter.
    num_read = 0
    for rec in reader:
        sorter += rec
        num_read += 1
        if options.output and num_read % Sort.print_every_n_records == 0:
            logger.info("Sorted %d records" % num_read)
    # Final progress line unless the loop just logged it.
    if options.output and (num_read == 0
                           or num_read % Sort.print_every_n_records != 0):
        logger.info("Sorted %d records" % num_read)

    # Phase 2: drain the sorter into the writer in sorted order.
    num_written = 0
    for rec in sorter:
        writer += rec
        num_written += 1
        if options.output and num_written % Sort.print_every_n_records == 0:
            logger.info("Wrote %d records" % num_written)
    if options.output and (num_written == 0
                           or num_written % Sort.print_every_n_records != 0):
        logger.info("Wrote %d records" % num_written)

    sorter.close()
    reader.close()
    writer.close()
def do_work(self):
    """Main wrapper function for running protect MAF merging"""
    # Reader: open all input MAF readers (stored on self.maf_readers).
    self.load_readers()
    # Header: build the merged output header; cache its scheme/columns.
    self.setup_maf_header()
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    # Sorter: disk-backed sorter so merged records come out coordinate-sorted.
    sorter = MafSorter(max_objects_in_ram=100000,
                       sort_order_name=BarcodesAndCoordinate.name(),
                       scheme=self.maf_header.scheme(),
                       contigs=self.maf_header.contigs())
    # Merger
    self._merger = MafRecordMerger_1_0_0(self._scheme)
    # Overlap iterator: yields groups of records overlapping across callers.
    o_iter = LocatableOverlapIterator(
        self.maf_readers,
        contigs=self.maf_header.contigs(),
        peekable_iterator_class=FilteringPeekableIterator)
    # ndp filter: normal-depth filter built from the configured threshold.
    ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
    ndp_tag = ndp_filter.tags[0]
    # Counts
    processed = 0
    try:
        for record in o_iter:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} overlapping intervals...".format(
                        processed))
            result = OverlapSet(record, self.callers)
            for maf_record in self._merger.merge_records(result):
                if maf_record is not None:
                    # Recheck normal depth: the merged record may disagree
                    # with the per-caller GDC_FILTER tags it inherited.
                    gdc_filters = maf_record['GDC_FILTER'].value
                    has_tag = ndp_tag in gdc_filters
                    ndp = ndp_filter.filter(maf_record)
                    if has_tag != ndp:
                        if ndp:
                            # Filter now applies: add its tag(s) in place.
                            gdc_filters.extend(ndp_filter.tags)
                        else:
                            # Filter no longer applies: drop its tag.
                            gdc_filters = list(
                                filter(lambda x: x != ndp_filter.tags[0],
                                       gdc_filters))
                        # Rebuild the column so tags stay sorted.
                        maf_record["GDC_FILTER"] = get_builder(
                            "GDC_FILTER", self._scheme,
                            value=sorted(gdc_filters))
                    # Add to sorter
                    sorter += maf_record
            processed += 1
        self.logger.info(
            "Writing {0} sorted, merged records...".format(processed))
        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)
        counter = 0
        for record in sorter:
            if counter > 0 and counter % 1000 == 0:
                self.logger.info(
                    "Wrote {0} sorted, merged records...".format(counter))
            self.maf_writer += record
            counter += 1
        self.logger.info(
            "Finished writing {0} sorted, merged records.".format(counter))
    finally:
        # Always release readers, the sorter's temp files, and the writer.
        for reader in self.maf_readers:
            reader.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
def do_work(self):
    """Main wrapper function for running vcf2maf"""
    self.logger.info(
        "Processing input vcf {0}...".format(self.options["input_vcf"])
    )

    # Initialize the maf file
    self.setup_maf_header()
    # Disk-backed sorter so output records are coordinate-sorted.
    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=self.maf_header.scheme(),
        fasta_index=self.options["reference_fasta_index"],
    )
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Initialize vcf reader
    vcf_object = pysam.VariantFile(self.options["input_vcf"])
    tumor_sample_id = self.options["tumor_vcf_id"]
    normal_sample_id = self.options["normal_vcf_id"]
    is_tumor_only = self.options["tumor_only"]

    try:
        # Validate samples: normal may legitimately be absent in tumor-only
        # mode (can_fail=is_tumor_only).
        tumor_idx = assert_sample_in_header(
            vcf_object, self.options["tumor_vcf_id"]
        )
        normal_idx = assert_sample_in_header(
            vcf_object, self.options["normal_vcf_id"], can_fail=is_tumor_only
        )

        # extract annotation from header (VEP CSQ field layout)
        ann_cols_format, vep_key = extract_annotation_from_header(
            vcf_object, vep_key="CSQ"
        )

        # Initialize annotators
        self.setup_annotators()

        # Initialize filters
        self.setup_filters()

        # Convert
        line = 0
        for vcf_record in vcf_object.fetch():
            line += 1
            if line % 1000 == 0:
                self.logger.info("Processed {0} records...".format(line))

            # Extract data
            data = self.extract(
                tumor_sample_id,
                normal_sample_id,
                tumor_idx,
                normal_idx,
                ann_cols_format,
                vep_key,
                vcf_record,
                is_tumor_only,
            )

            # Skip rare occasions where VEP doesn't provide IMPACT or the consequence is ?
            if (
                not data["selected_effect"]["IMPACT"]
                or data["selected_effect"]["One_Consequence"] == "?"
            ):
                # NOTE(review): ``warn`` is the deprecated alias for
                # ``warning`` in stdlib logging — confirm this Logger's API.
                self.logger.warn(
                    "Skipping record with unknown impact or consequence: {0} - {1}".format(
                        data["selected_effect"]["IMPACT"],
                        data["selected_effect"]["One_Consequence"],
                    )
                )
                continue

            # Transform
            maf_record = self.transform(
                vcf_record, data, is_tumor_only, line_number=line
            )

            # Add to sorter
            sorter += maf_record

        # Write: drain the sorter into the output MAF in sorted order.
        self.logger.info("Writing {0} sorted records...".format(line))
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )
        counter = 0
        for record in sorter:
            counter += 1
            if counter % 1000 == 0:
                self.logger.info("Wrote {0} records...".format(counter))
            self.maf_writer += record
        self.logger.info("Finished writing {0} records".format(counter))
    finally:
        # Always release the VCF handle, sorter temp files, writer, and
        # any annotator subprocesses/resources.
        vcf_object.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
        for anno in self.annotators:
            if self.annotators[anno]:
                self.annotators[anno].shutdown()
        self.logger.info("Finished")
class MafWriter(object):
    """A writer of a MAF file.

    Writes the header (and column names, when a scheme is known) eagerly in
    the constructor.  Records are either written directly (``assume_sorted``)
    or buffered in a :class:`MafSorter` and flushed, sorted, on :meth:`close`.
    """

    def __init__(
        self,
        handle: IO,
        header: MafHeader,
        validation_stringency: ValidationStringency = ValidationStringency.
        Strict,
        assume_sorted: bool = True,
    ):
        """Validate and write the header, then set up sorting/checking.

        :param handle: an open, writable file handle (closed by ``close()``).
        :param header: the MafHeader to validate and write.
        :param validation_stringency: ``None`` is coerced to ``Silent``.
        :param assume_sorted: when True, records are trusted to arrive in
            sort order and are only checked, never re-sorted.
        """
        self._handle = handle
        self._header = header
        self._logger: logging.Logger = Logger.get_logger(
            self.__class__.__name__)
        self._assume_sorted = assume_sorted
        self._sorter: Optional[MafSorter] = None
        self._checker: Optional[SortOrderChecker] = None
        self.validation_stringency = (ValidationStringency.Silent
                                      if (validation_stringency is None)
                                      else validation_stringency)
        # validate the header
        self._header.validate(
            validation_stringency=self.validation_stringency,
            logger=self._logger)
        # write the header
        if len(self._header) > 0:
            self._handle.write(str(self._header) + "\n")
        # write the column names if we have a scheme; otherwise the scheme
        # (and column line) is derived from the first record in __iadd__
        self._scheme = self._header.scheme()
        if self._scheme:
            self._handle.write(
                MafRecord.ColumnSeparator.join(self._scheme.column_names())
                + "\n")
        self._set_checker_and_sorter()

    def _set_checker_and_sorter(self) -> None:
        """Set the sort order checker and sorter.

        Must be called **after** the scheme has been set."""
        # If sorted input is assumed, or the header's sort order has no sort
        # key, only check the order; otherwise buffer records in a sorter.
        if self._assume_sorted or not self._header.sort_order().sort_key(
        ):  # type: ignore
            self._checker = SortOrderChecker(self._header.sort_order())
            self._sorter = None
        else:
            self._checker = None
            self._sorter = MafSorter(
                sort_order_name=self._header.sort_order().name(),
                scheme=self._scheme  # type: ignore
            )

    def header(self) -> MafHeader:
        """Get the underlying MafHeader."""
        return self._header

    def __iadd__(self, record: MafRecord) -> 'MafWriter':
        """Write a MafRecord."""
        # set the scheme and write the column names if not already written
        if not self._scheme:
            column_names = [str(key) for key in record.keys()]
            self._scheme = NoRestrictionsScheme(column_names=column_names)
            self._handle.write(
                MafRecord.ColumnSeparator.join(self._scheme.column_names())
                + "\n")
            self._set_checker_and_sorter()
        # validate the record
        record.validate(
            validation_stringency=self.validation_stringency,
            logger=self._logger,
            reset_errors=True,
            scheme=self._scheme,
        )
        # either write it directly, or add it to the sorter
        if self._sorter:
            self._sorter += record  # type: ignore
        else:
            self._handle.write(str(record) + "\n")
        return self

    def write(self, record: MafRecord) -> 'MafWriter':
        """Write a MafRecord."""
        return self.__iadd__(record)

    def close(self) -> None:
        """Closes the underlying file handle, and writes the records if the
        output was to be sorted."""
        if self._sorter:
            # flush the buffered records in sorted order before closing
            for rec in self._sorter:
                self._handle.write(str(rec) + "\n")
            self._sorter.close()
        self._handle.close()

    @classmethod
    def from_fd(
        cls,
        desc: IO,
        header: MafHeader,
        validation_stringency: ValidationStringency = ValidationStringency.
        Strict,
        assume_sorted: bool = True,
    ) -> 'MafWriter':
        """Create a MafWriter from the given file handle."""
        return MafWriter(
            handle=desc,
            header=header,
            validation_stringency=validation_stringency,
            assume_sorted=assume_sorted,
        )

    @classmethod
    def from_path(
        cls,
        path: str,
        header: MafHeader,
        validation_stringency: ValidationStringency = ValidationStringency.
        Strict,
        assume_sorted: bool = True,
    ) -> 'MafWriter':
        """Create a MafWriter from the given path ."""
        # transparently gzip-compress when the path ends with ".gz"
        if path.endswith(".gz"):
            handle = gzip.open(path, "wt")
        else:
            handle = open(path, "w")
        return MafWriter.from_fd(
            desc=handle,
            header=header,
            validation_stringency=validation_stringency,
            assume_sorted=assume_sorted,
        )
def test_sorter_default(self):
    """Sorting works with only the required constructor arguments."""
    sorter = MafSorter(
        max_objects_in_ram=100,
        sort_order_name=BarcodesAndCoordinate.name(),
    )
    self.__test_sorter(sorter=sorter)