Beispiel #1
0
 def _set_checker_and_sorter(self):
     """Pick either a sort-order checker or a sorter for written records.

     Must be called **after** the scheme has been set, because the sorter
     is built with ``self._scheme``.  A ``MafSorter`` is created only when
     the output is not assumed to be sorted and the header's sort order
     has a sort key; in every other case a ``SortOrderChecker`` is created.
     Exactly one of ``self._checker`` / ``self._sorter`` is non-``None``
     afterwards.
     """
     sort_order = self._header.sort_order()
     if not self._assume_sorted and sort_order.sort_key():
         # Records must be buffered and re-ordered before being written.
         self._checker = None
         self._sorter = MafSorter(sort_order_name=sort_order.name(),
                                  scheme=self._scheme)
     else:
         # Records are taken in the order given.
         self._checker = SortOrderChecker(sort_order)
         self._sorter = None
Beispiel #2
0
    def test_sorter_with_sort_order_args(self):
        """Exercise a sorter that is configured from a FASTA index file.

        A temporary index with one line per contig (chr1..chr10) is written
        and handed to ``MafSorter`` via ``fasta_index``.  The shared sorter
        check must succeed for ``chr5``, which is listed in the index, and
        must raise ``ValueError`` for ``1``, which is not.
        """
        # One entry per contig.  Every element needs its trailing comma:
        # adjacent string literals are concatenated implicitly, which would
        # fuse two contigs into a single malformed index line.
        lines = [
            "chr1\t248956422\t112\t70\t71",
            "chr2\t242193529\t252513167\t70\t71",
            "chr3\t198295559\t498166716\t70\t71",
            "chr4\t190214555\t699295181\t70\t71",
            "chr5\t181538259\t892227221\t70\t71",
            "chr6\t170805979\t1076358996\t70\t71",
            "chr7\t159345973\t1249605173\t70\t71",
            "chr8\t145138636\t1411227630\t70\t71",
            "chr9\t138394717\t1558439788\t70\t71",
            "chr10\t133797422\t1698811686\t70\t71",
        ]
        fd, fn = tmp_file(lines=lines)

        try:
            sorter = MafSorter(
                sort_order_name=BarcodesAndCoordinate.name(),
                max_objects_in_ram=100,
                fasta_index=fn,
            )

            self.__test_sorter(sorter=sorter, chromosome="chr5")

            with self.assertRaises(ValueError):
                self.__test_sorter(sorter=sorter, chromosome="1")
        finally:
            # Remove the temporary index even when an assertion above fails.
            fd.close()
            os.remove(fn)
Beispiel #3
0
 def test_sorter_with_scheme(self):
     """Run the shared sorter check on a sorter built with a scheme."""
     dummy = DummyScheme()
     self.__test_sorter(
         sorter=MafSorter(
             sort_order_name=BarcodesAndCoordinate.name(),
             scheme=dummy,
             max_objects_in_ram=100,
         ),
         with_scheme=True,
     )
Beispiel #4
0
    def __main__(cls, options):
        """Sort the records of the input MAF and write them out.

        All records are first read into a ``MafSorter`` and then drained
        from it, in sorted order, into the writer.  Progress is logged every
        ``Sort.print_every_n_records`` records, plus one final tally per
        phase, but only when ``options.output`` is truthy.
        """
        logger = Logger.get_logger(cls.__name__)

        reader = MafReader.reader_from(
            path=options.input,
            validation_stringency=options.validation_stringency,
            scheme=options.scheme,
        )
        writer = writer_from_reader(reader=reader, options=options)
        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=options.sort_order,
            scheme=writer.header().scheme(),
            fasta_index=options.fasta_index,
        )

        def at_interval(count):
            """True when ``count`` falls on a progress-report boundary."""
            return count % Sort.print_every_n_records == 0

        # Phase 1: feed every input record to the sorter.
        added = 0
        for record in reader:
            sorter += record
            added += 1
            if options.output and at_interval(added):
                logger.info("Sorted %d records" % added)
        # Final tally, unless the loop has just reported this exact count.
        if options.output and (added == 0 or not at_interval(added)):
            logger.info("Sorted %d records" % added)

        # Phase 2: drain the sorter, now in sorted order, into the writer.
        written = 0
        for record in sorter:
            writer += record
            written += 1
            if options.output and at_interval(written):
                logger.info("Wrote %d records" % written)
        if options.output and (written == 0 or not at_interval(written)):
            logger.info("Wrote %d records" % written)

        sorter.close()
        reader.close()
        writer.close()
    def do_work(self):
        """Main wrapper function for running protect MAF merging.

        Loads the readers and the header, walks the overlapping intervals of
        all input MAFs, merges each overlap set into MAF records, re-checks
        the normal-depth filter tag on every merged record, and finally
        writes the merged records in sorted order to
        ``self.options['output_maf']``.

        The readers, the sorter and (if it was created) the writer are
        closed even when an error occurs part-way through.
        """
        # Reader
        self.load_readers()

        # Header
        self.setup_maf_header()

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)

        # Sorter
        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=BarcodesAndCoordinate.name(),
            scheme=self.maf_header.scheme(),
            contigs=self.maf_header.contigs(),
        )

        # Merger
        self._merger = MafRecordMerger_1_0_0(self._scheme)

        # Overlap iterator
        o_iter = LocatableOverlapIterator(
            self.maf_readers,
            contigs=self.maf_header.contigs(),
            peekable_iterator_class=FilteringPeekableIterator,
        )

        # ndp filter
        ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
        ndp_tag = ndp_filter.tags[0]

        # Counts: overlap intervals seen vs. merged records handed to the
        # sorter.  One interval may yield zero, one or several records, so
        # the two are tracked separately.
        processed = 0
        merged = 0
        try:
            for record in o_iter:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} overlapping intervals...".format(
                            processed))

                result = OverlapSet(record, self.callers)

                for maf_record in self._merger.merge_records(result):
                    if maf_record is None:
                        continue

                    # Recheck normal depth: the tag on the merged record must
                    # agree with what the filter says about that record.
                    gdc_filters = maf_record['GDC_FILTER'].value
                    has_tag = ndp_tag in gdc_filters
                    ndp = ndp_filter.filter(maf_record)
                    if has_tag != ndp:
                        if ndp:
                            gdc_filters.extend(ndp_filter.tags)
                        else:
                            gdc_filters = [
                                tag for tag in gdc_filters if tag != ndp_tag
                            ]

                        maf_record["GDC_FILTER"] = get_builder(
                            "GDC_FILTER",
                            self._scheme,
                            value=sorted(gdc_filters))

                    # Add to sorter
                    sorter += maf_record
                    merged += 1

                processed += 1

            # Report the number of records, not the number of intervals.
            self.logger.info(
                "Writing {0} sorted, merged records...".format(merged))

            # Writer
            self.maf_writer = MafWriter.from_path(
                path=self.options['output_maf'],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict)

            counter = 0
            for record in sorter:
                if counter > 0 and counter % 1000 == 0:
                    self.logger.info(
                        "Wrote {0} sorted, merged records...".format(counter))
                self.maf_writer += record
                counter += 1

            self.logger.info(
                "Finished writing {0} sorted, merged records.".format(counter))

        finally:
            for reader in self.maf_readers:
                reader.close()

            sorter.close()

            # The writer only exists once the merge phase has completed.
            if self.maf_writer:
                self.maf_writer.close()
    def do_work(self):
        """Main wrapper function for running vcf2maf.

        Reads every record of ``self.options["input_vcf"]``, extracts and
        transforms it into a MAF record, collects the MAF records in a
        sorter, and then writes them in sorted order to
        ``self.options["output_maf"]``.  Records for which the selected
        effect has no IMPACT, or whose consequence is ``"?"``, are skipped
        with a warning.

        The VCF handle, the sorter, the writer (if it was created) and all
        annotators are released even when an error occurs part-way through.
        """
        self.logger.info(
            "Processing input vcf {0}...".format(self.options["input_vcf"])
        )

        # Initialize the maf file
        self.setup_maf_header()

        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=BarcodesAndCoordinate.name(),
            scheme=self.maf_header.scheme(),
            fasta_index=self.options["reference_fasta_index"],
        )

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Initialize vcf reader
        vcf_object = pysam.VariantFile(self.options["input_vcf"])
        tumor_sample_id = self.options["tumor_vcf_id"]
        normal_sample_id = self.options["normal_vcf_id"]
        is_tumor_only = self.options["tumor_only"]

        try:
            # Validate samples
            tumor_idx = assert_sample_in_header(vcf_object, tumor_sample_id)
            normal_idx = assert_sample_in_header(
                vcf_object, normal_sample_id, can_fail=is_tumor_only
            )

            # extract annotation from header
            ann_cols_format, vep_key = extract_annotation_from_header(
                vcf_object, vep_key="CSQ"
            )

            # Initialize annotators
            self.setup_annotators()

            # Initialize filters
            self.setup_filters()

            # Convert.  ``line`` counts every VCF record read (and is passed
            # on as the record's line number); ``converted`` counts only the
            # records that actually reach the sorter.
            line = 0
            converted = 0
            for vcf_record in vcf_object.fetch():

                line += 1

                if line % 1000 == 0:
                    self.logger.info("Processed {0} records...".format(line))

                # Extract data
                data = self.extract(
                    tumor_sample_id,
                    normal_sample_id,
                    tumor_idx,
                    normal_idx,
                    ann_cols_format,
                    vep_key,
                    vcf_record,
                    is_tumor_only,
                )

                # Skip rare occasions where VEP doesn't provide IMPACT or the
                # consequence is ?
                selected_effect = data["selected_effect"]
                if (
                    not selected_effect["IMPACT"]
                    or selected_effect["One_Consequence"] == "?"
                ):
                    # ``warning`` replaces the deprecated ``warn`` alias.
                    self.logger.warning(
                        "Skipping record with unknown impact or consequence: {0} - {1}".format(
                            selected_effect["IMPACT"],
                            selected_effect["One_Consequence"],
                        )
                    )
                    continue

                # Transform
                maf_record = self.transform(
                    vcf_record, data, is_tumor_only, line_number=line
                )

                # Add to sorter
                sorter += maf_record
                converted += 1

            # Write.  Report the records that will be written, which excludes
            # the ones skipped above.
            self.logger.info(
                "Writing {0} sorted records...".format(converted)
            )
            self.maf_writer = MafWriter.from_path(
                path=self.options["output_maf"],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict,
            )

            counter = 0
            for record in sorter:

                counter += 1

                if counter % 1000 == 0:
                    self.logger.info("Wrote {0} records...".format(counter))

                self.maf_writer += record

            self.logger.info("Finished writing {0} records".format(counter))

        finally:
            vcf_object.close()
            sorter.close()
            # The writer only exists once the conversion phase has completed.
            if self.maf_writer:
                self.maf_writer.close()
            for anno in self.annotators:
                if self.annotators[anno]:
                    self.annotators[anno].shutdown()

        self.logger.info("Finished")
Beispiel #7
0
class MafWriter(object):
    """A writer of a MAF file.

    The header is validated and written on construction; if the header
    carries a scheme, the column names are written as well.  Records are
    then either written straight to the handle or, when ``assume_sorted`` is
    false and the header's sort order has a sort key, buffered in a
    ``MafSorter`` and written in sorted order by ``close()``.
    """

    def __init__(
        self,
        handle: IO,
        header: MafHeader,
        validation_stringency: ValidationStringency = (
            ValidationStringency.Strict),
        assume_sorted: bool = True,
    ):
        """Validate and write the header to ``handle``.

        :param handle: open, writable text handle; closed by ``close()``.
        :param header: header to validate and write.
        :param validation_stringency: how validation errors are treated;
            ``None`` is mapped to ``ValidationStringency.Silent``.
        :param assume_sorted: when true, records are written in the order
            they are added and no sorter is created.
        """
        self._handle = handle
        self._header = header
        self._logger: logging.Logger = Logger.get_logger(
            self.__class__.__name__)
        self._assume_sorted = assume_sorted
        self._sorter: Optional[MafSorter] = None
        self._checker: Optional[SortOrderChecker] = None

        self.validation_stringency = (ValidationStringency.Silent if
                                      (validation_stringency is None) else
                                      validation_stringency)

        # validate the header
        self._header.validate(validation_stringency=self.validation_stringency,
                              logger=self._logger)

        # write the header
        if len(self._header) > 0:
            self._handle.write(str(self._header) + "\n")

        # write the column names if we have a scheme; without one, this is
        # deferred until the first record is added
        self._scheme = self._header.scheme()
        if self._scheme:
            self._handle.write(
                MafRecord.ColumnSeparator.join(self._scheme.column_names()) +
                "\n")
            self._set_checker_and_sorter()

    def _set_checker_and_sorter(self) -> None:
        """Set the sort order checker and sorter.  Must be called **after**
        the scheme has been set.

        Exactly one of ``self._checker`` and ``self._sorter`` is set to a
        non-``None`` value.
        """
        if self._assume_sorted or not self._header.sort_order().sort_key(
        ):  # type: ignore
            self._checker = SortOrderChecker(self._header.sort_order())
            self._sorter = None
        else:
            self._checker = None
            self._sorter = MafSorter(
                sort_order_name=self._header.sort_order().name(),
                scheme=self._scheme  # type: ignore
            )

    def header(self) -> MafHeader:
        """Get the underlying MafHeader."""
        return self._header

    def __iadd__(self, record: MafRecord) -> 'MafWriter':
        """Write a MafRecord.

        The record is validated against the writer's scheme and then either
        written immediately or handed to the sorter.  Returns ``self`` so
        that ``writer += record`` keeps ``writer`` bound to this object.
        """

        # set the scheme and write the column names if not already written
        if not self._scheme:
            column_names = [str(key) for key in record.keys()]
            self._scheme = NoRestrictionsScheme(column_names=column_names)
            self._handle.write(
                MafRecord.ColumnSeparator.join(self._scheme.column_names()) +
                "\n")
            self._set_checker_and_sorter()

        # validate the record
        record.validate(
            validation_stringency=self.validation_stringency,
            logger=self._logger,
            reset_errors=True,
            scheme=self._scheme,
        )

        # either write it directly, or add it to the sorter.  Test for
        # presence with ``is not None`` so the choice cannot depend on how
        # the sorter object evaluates in a boolean context.
        # NOTE(review): ``self._checker`` is assigned by
        # ``_set_checker_and_sorter`` but never consulted in this class, so
        # the order of directly written records is not verified here.
        if self._sorter is not None:
            self._sorter += record  # type: ignore
        else:
            self._handle.write(str(record) + "\n")

        return self

    def write(self, record: MafRecord) -> 'MafWriter':
        """Write a MafRecord."""
        return self.__iadd__(record)

    def close(self) -> None:
        """Closes the underlying file handle, and writes the records if the
        output was to be sorted.

        The sorter and the handle are closed even if writing the sorted
        records fails; the original error is still raised.
        """
        try:
            if self._sorter is not None:
                try:
                    for rec in self._sorter:
                        self._handle.write(str(rec) + "\n")
                finally:
                    self._sorter.close()
        finally:
            self._handle.close()

    @classmethod
    def from_fd(
        cls,
        desc: IO,
        header: MafHeader,
        validation_stringency: ValidationStringency = (
            ValidationStringency.Strict),
        assume_sorted: bool = True,
    ) -> 'MafWriter':
        """Create a MafWriter from the given file handle."""
        return MafWriter(
            handle=desc,
            header=header,
            validation_stringency=validation_stringency,
            assume_sorted=assume_sorted,
        )

    @classmethod
    def from_path(
        cls,
        path: str,
        header: MafHeader,
        validation_stringency: ValidationStringency = (
            ValidationStringency.Strict),
        assume_sorted: bool = True,
    ) -> 'MafWriter':
        """Create a MafWriter from the given path.

        Paths ending in ``.gz`` are written gzip-compressed in text mode.
        If constructing the writer fails (for example because header
        validation raises), the freshly opened handle is closed before the
        error propagates.
        """
        if path.endswith(".gz"):
            handle = gzip.open(path, "wt")
        else:
            handle = open(path, "w")
        try:
            return MafWriter.from_fd(
                desc=handle,
                header=header,
                validation_stringency=validation_stringency,
                assume_sorted=assume_sorted,
            )
        except Exception:
            # Nobody else holds a reference to the handle yet; do not leak it.
            handle.close()
            raise
Beispiel #8
0
 def test_sorter_default(self):
     """Run the shared sorter check on a sorter given only a sort order
     name and an in-memory object limit."""
     default_sorter = MafSorter(
         sort_order_name=BarcodesAndCoordinate.name(),
         max_objects_in_ram=100,
     )
     self.__test_sorter(sorter=default_sorter)