def do_work(self): """Main wrapper function for running public MAF filter""" self.logger.info("Processing input maf {0}...".format( self.options["input_maf"])) # Reader self.maf_reader = MafReader.reader_from( path=self.options['input_maf'], validation_stringency=ValidationStringency.Strict) # Header self.setup_maf_header() # Writer self.maf_writer = MafWriter.from_path( path=self.options['output_maf'], header=self.maf_header, validation_stringency=ValidationStringency.Strict) self._scheme = self.maf_header.scheme() self._columns = get_columns_from_header(self.maf_header) self._colset = set(self._columns) # Counts processed = 0 hotspot_gdc_set = set(['gdc_pon', 'common_in_exac']) try: for record in self.maf_reader: if processed > 0 and processed % 1000 == 0: self.logger.info( "Processed {0} records...".format(processed)) callers = record['callers'].value if len(callers) >= self.options['min_callers'] and \ record['Mutation_Status'].value.value == 'Somatic': self.metrics.add_sample_swap_metric(record) gdc_filters = record['GDC_FILTER'].value gfset = set(gdc_filters) if self.is_hotspot(record): if len(gfset - hotspot_gdc_set) == 0: self.write_record(record) elif not gfset: self.write_record(record) processed += 1 self.metrics.input_records += 1 self.logger.info("Processed {0} records.".format(processed)) print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True)) finally: self.maf_reader.close() self.maf_writer.close()
def do_work(self): """Main wrapper function for running vcf2maf""" self.logger.info( "Processing input vcf {0}...".format(self.options["input_vcf"]) ) # Initialize the maf file self.setup_maf_header() sorter = MafSorter( max_objects_in_ram=100000, sort_order_name=BarcodesAndCoordinate.name(), scheme=self.maf_header.scheme(), fasta_index=self.options["reference_fasta_index"], ) self._scheme = self.maf_header.scheme() self._columns = get_columns_from_header(self.maf_header) self._colset = set(self._columns) # Initialize vcf reader vcf_object = pysam.VariantFile(self.options["input_vcf"]) tumor_sample_id = self.options["tumor_vcf_id"] normal_sample_id = self.options["normal_vcf_id"] is_tumor_only = self.options["tumor_only"] try: # Validate samples tumor_idx = assert_sample_in_header( vcf_object, self.options["tumor_vcf_id"] ) normal_idx = assert_sample_in_header( vcf_object, self.options["normal_vcf_id"], can_fail=is_tumor_only ) # extract annotation from header ann_cols_format, vep_key = extract_annotation_from_header( vcf_object, vep_key="CSQ" ) # Initialize annotators self.setup_annotators() # Initialize filters self.setup_filters() # Convert line = 0 for vcf_record in vcf_object.fetch(): line += 1 if line % 1000 == 0: self.logger.info("Processed {0} records...".format(line)) # Extract data data = self.extract( tumor_sample_id, normal_sample_id, tumor_idx, normal_idx, ann_cols_format, vep_key, vcf_record, is_tumor_only, ) # Skip rare occasions where VEP doesn't provide IMPACT or the consequence is ? if ( not data["selected_effect"]["IMPACT"] or data["selected_effect"]["One_Consequence"] == "?" ): self.logger.warn( "Skipping record with unknown impact or consequence: {0} - {1}".format( data["selected_effect"]["IMPACT"], data["selected_effect"]["One_Consequence"], ) ) continue # Transform maf_record = self.transform( vcf_record, data, is_tumor_only, line_number=line ) # Add to sorter sorter += maf_record # Write self.logger.info("Writing {0} sorted records...".format(line)) self.maf_writer = MafWriter.from_path( path=self.options["output_maf"], header=self.maf_header, validation_stringency=ValidationStringency.Strict, ) counter = 0 for record in sorter: counter += 1 if counter % 1000 == 0: self.logger.info("Wrote {0} records...".format(counter)) self.maf_writer += record self.logger.info("Finished writing {0} records".format(counter)) finally: vcf_object.close() sorter.close() if self.maf_writer: self.maf_writer.close() for anno in self.annotators: if self.annotators[anno]: self.annotators[anno].shutdown() self.logger.info("Finished")
def do_work(self): """Main wrapper function for running protect MAF merging""" # Reader self.load_readers() # Header self.setup_maf_header() self._scheme = self.maf_header.scheme() self._columns = get_columns_from_header(self.maf_header) # Sorter sorter = MafSorter(max_objects_in_ram=100000, sort_order_name=BarcodesAndCoordinate.name(), scheme=self.maf_header.scheme(), contigs=self.maf_header.contigs()) # Merger self._merger = MafRecordMerger_1_0_0(self._scheme) # Overlap iterator o_iter = LocatableOverlapIterator( self.maf_readers, contigs=self.maf_header.contigs(), peekable_iterator_class=FilteringPeekableIterator) # ndp filter ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth']) ndp_tag = ndp_filter.tags[0] # Counts processed = 0 try: for record in o_iter: if processed > 0 and processed % 1000 == 0: self.logger.info( "Processed {0} overlapping intervals...".format( processed)) result = OverlapSet(record, self.callers) for maf_record in self._merger.merge_records(result): if maf_record is not None: # Recheck normal depth gdc_filters = maf_record['GDC_FILTER'].value has_tag = ndp_tag in gdc_filters ndp = ndp_filter.filter(maf_record) if has_tag != ndp: if ndp: gdc_filters.extend(ndp_filter.tags) else: gdc_filters = list( filter(lambda x: x != ndp_filter.tags[0], gdc_filters)) maf_record["GDC_FILTER"] = get_builder( "GDC_FILTER", self._scheme, value=sorted(gdc_filters)) # Add to sorter sorter += maf_record processed += 1 self.logger.info( "Writing {0} sorted, merged records...".format(processed)) # Writer self.maf_writer = MafWriter.from_path( path=self.options['output_maf'], header=self.maf_header, validation_stringency=ValidationStringency.Strict) counter = 0 for record in sorter: if counter > 0 and counter % 1000 == 0: self.logger.info( "Wrote {0} sorted, merged records...".format(counter)) self.maf_writer += record counter += 1 self.logger.info( "Finished writing {0} sorted, merged records.".format(counter)) finally: for reader in self.maf_readers: reader.close() sorter.close() if self.maf_writer: self.maf_writer.close()
def do_work(self): """Main wrapper function for running public MAF filter""" self.logger.info("Processing input maf {0}...".format( self.options["input_maf"])) # Reader self.maf_reader = MafReader.reader_from( path=self.options["input_maf"], validation_stringency=ValidationStringency.Strict, ) # Header self.setup_maf_header() # Writer self.maf_writer = MafWriter.from_path( path=self.options["output_maf"], header=self.maf_header, validation_stringency=ValidationStringency.Strict, ) self._scheme = self.maf_header.scheme() self._columns = get_columns_from_header(self.maf_header) self._colset = set(self._columns) # Counts processed = 0 hotspot_gdc_set = set(["gdc_pon", "common_in_gnomAD"]) nonexonic_set = set(["NonExonic"]) try: for record in self.maf_reader: if processed > 0 and processed % 1000 == 0: self.logger.info( "Processed {0} records...".format(processed)) callers = record["callers"].value if (len(callers) >= self.options["min_callers"] and record["Mutation_Status"].value.value == "Somatic"): self.metrics.add_sample_swap_metric(record) gdc_filters = record["GDC_FILTER"].value gfset = set(gdc_filters) if self.is_hotspot(record): other_filts = gfset - hotspot_gdc_set if len(other_filts) == 0: self.write_record(record) elif len(other_filts - nonexonic_set ) == 0 and self.is_splice(record): # Rescue splicing if NonExonic self.write_record(record) # Rescue splicing if NonExonic elif len(gfset - nonexonic_set) == 0 and self.is_splice(record): self.write_record(record) elif not gfset: self.write_record(record) processed += 1 self.metrics.input_records += 1 self.logger.info("Processed {0} records.".format(processed)) print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True)) finally: self.maf_reader.close() self.maf_writer.close()