Example #1
import sys

from maflib.header import MafHeader
from maflib.writer import MafWriter


def writer_from_reader(reader, options):
    """
    Builds a writer from the given reader and command line options.
    :param reader: the reader from which records will be obtained
    :param options: the command line options, which should have "output",
    "version", and "annotation" defined.
    :return: a ``MafWriter`` writing to ``options.output`` if set, otherwise
    to standard output
    """
    out_header = MafHeader.from_reader(
        reader=reader,
        version=options.version,
        annotation=options.annotation,
        sort_order=getattr(options, 'sort_order', None)
    )

    if options.output:
        writer = MafWriter.from_path(
            path=options.output,
            header=out_header,
            validation_stringency=options.validation_stringency)
    else:
        writer = MafWriter.from_fd(
            desc=sys.stdout,
            header=out_header,
            validation_stringency=options.validation_stringency)
    return writer
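
A minimal usage sketch for the function above, assuming maf-lib's module layout (maflib.reader) and an argparse-style options object; the paths and version strings are illustrative, not values required by the library:

# Hypothetical driver for writer_from_reader; the option values below are
# illustrative assumptions, not values mandated by maf-lib.
import argparse

from maflib.reader import MafReader

options = argparse.Namespace(
    output="out.maf",               # a falsy value would write to sys.stdout
    version="gdc-1.0.0",            # assumed MAF version string
    annotation="gdc-1.0.0-public",  # assumed annotation spec
    validation_stringency=None,     # let the library pick its default
)

reader = MafReader.reader_from(path="in.maf")
writer = writer_from_reader(reader, options)
for record in reader:
    writer.write(record)
writer.close()
reader.close()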
Example #2
    def do_work(self):
        """Main wrapper function for running public MAF filter"""
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options['input_maf'],
            validation_stringency=ValidationStringency.Strict)

        # Header
        self.setup_maf_header()

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Counts
        processed = 0
        hotspot_gdc_set = {'gdc_pon', 'common_in_exac'}

        try:
            for record in self.maf_reader:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} records...".format(processed))

                callers = record['callers'].value
                if (len(callers) >= self.options['min_callers'] and
                        record['Mutation_Status'].value.value == 'Somatic'):

                    self.metrics.add_sample_swap_metric(record)

                    gdc_filters = record['GDC_FILTER'].value
                    gfset = set(gdc_filters)

                    if self.is_hotspot(record):
                        if len(gfset - hotspot_gdc_set) == 0:
                            self.write_record(record)

                    elif not gfset:
                        self.write_record(record)

                processed += 1
                self.metrics.input_records += 1

            self.logger.info("Processed {0} records.".format(processed))
            print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))

        finally:

            self.maf_reader.close()
            self.maf_writer.close()
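
The keep/drop decision inside the loop reduces to a small predicate; a distilled sketch (not maf-lib API; hotspot_gdc_set mirrors the set defined above and is_hotspot stands in for the example's method):

def keep_record(record, min_callers, is_hotspot):
    # Distilled from the loop above; illustrative only.
    if len(record['callers'].value) < min_callers:
        return False
    if record['Mutation_Status'].value.value != 'Somatic':
        return False
    gfset = set(record['GDC_FILTER'].value)
    if is_hotspot(record):
        # Hotspots tolerate only the panel-of-normals / common-in-ExAC flags.
        return not (gfset - {'gdc_pon', 'common_in_exac'})
    return not gfset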
Example #3
    def test_empty_file(self):
        fd, path = tempfile.mkstemp()

        # No logging to stderr/stdout
        with captured_output() as (stdout, stderr):
            writer = MafWriter.from_path(
                path=path,
                header=MafHeader(),
                validation_stringency=ValidationStringency.Silent,
            )
            writer.close()
            self.assertEqual(read_lines(path), [])
            self.assertEqual(str(writer.header()), "")
        stdout = stdout.getvalue().rstrip('\r\n').split("\n")
        stderr = stderr.getvalue().rstrip('\r\n').split("\n")
        self.assertListEqual(stdout, [''])
        self.assertListEqual(stderr, [''])

        # Logging to stderr/stdout
        with captured_output() as (stdout, stderr):
            writer = MafWriter.from_path(
                path=path,
                header=MafHeader(),
                validation_stringency=ValidationStringency.Lenient,
            )
            writer.close()
            self.assertEqual(read_lines(path), [])
            self.assertEqual(str(writer.header()), "")
        stdout = stdout.getvalue().rstrip('\r\n').split("\n")
        stderr = stderr.getvalue().rstrip('\r\n').split("\n")
        self.assertListEqual(stdout, [''])
        self.assertListEqualAndIn(
            ['HEADER_MISSING_VERSION', 'HEADER_MISSING_ANNOTATION_SPEC'],
            stderr)

        # Exceptions
        with captured_output():
            with self.assertRaises(MafFormatException) as context:
                writer = MafWriter.from_path(
                    path=path,
                    header=MafHeader(),
                    validation_stringency=ValidationStringency.Strict,
                )
            self.assertEqual(context.exception.tpe,
                             MafValidationErrorType.HEADER_MISSING_VERSION)
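
The test exercises all three stringency levels against the same invalid (empty) header; a sketch of how the pattern generalizes (path is a placeholder; per-level behavior is as the test demonstrates):

# Sketch of the three validation behaviors shown by the test above.
for stringency in (ValidationStringency.Silent,    # swallow errors
                   ValidationStringency.Lenient,   # log errors to stderr
                   ValidationStringency.Strict):   # raise on first error
    try:
        writer = MafWriter.from_path(path=path,
                                     header=MafHeader(),
                                     validation_stringency=stringency)
        writer.close()
    except MafFormatException as err:
        print(err.tpe)  # e.g. MafValidationErrorType.HEADER_MISSING_VERSION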
Example #4
    def test_record_validation_error(self):
        scheme = TestMafWriter.TestScheme()
        fd, path = tempfile.mkstemp()

        # Create the header
        header_lines = (MafHeader.scheme_header_lines(scheme) +
                        ["#key1 value1", "#key2 value2"] +
                        ["str1\tNone\tstr2"])
        header = MafHeader.from_lines(
            lines=header_lines,
            validation_stringency=ValidationStringency.Silent)

        # Create the record
        values = ["string2", "error", "string1"]
        record_line = MafRecord.ColumnSeparator.join(values)
        record = MafRecord.from_line(
            line=record_line,
            scheme=scheme,
            line_number=1,
            validation_stringency=ValidationStringency.Silent,
        )

        # Write the header, and the record twice
        with captured_output() as (stdout, stderr):
            writer = MafWriter.from_path(
                header=header,
                validation_stringency=ValidationStringency.Lenient,
                path=path,
            )
            writer += record
            writer.write(record)
            writer.close()
        stdout = stdout.getvalue().rstrip('\r\n').split("\n")
        stderr = stderr.getvalue().rstrip('\r\n').split("\n")
        self.assertListEqual(stdout, [''])

        # The errors that should be written to stderr
        errors = [
            "HEADER_UNSUPPORTED_VERSION",
            "HEADER_UNSUPPORTED_ANNOTATION_SPEC",
            "RECORD_COLUMN_WITH_NO_VALUE",
            "RECORD_COLUMN_WITH_NO_VALUE",
        ]
        self.assertListEqualAndIn(errors, stderr)

        # The second column should be None
        err_record_line = record_line.replace("error", "None")
        self.assertListEqual(read_lines(path),
                             header_lines + [err_record_line, err_record_line])
Example #5
    def test_close(self):
        fd, path = tempfile.mkstemp()

        lines = [
            TestMafWriter.__version_line,
            TestMafWriter.__annotation_line,
            "#key1 value1",
            "#key2 value2",
            TestMafWriter.__keys_line,
        ]
        header = MafHeader.from_lines(lines=lines)
        writer = MafWriter.from_path(header=header, path=path)
        writer._handle.write("LAST")  # Naughty
        writer.close()
        self.assertListEqual(read_lines(path), lines + ["LAST"])

        with self.assertRaises(ValueError):
            writer._handle.write("Oh no")
Example #6
    def add_records(self):
        scheme = TestMafWriter.TestScheme()
        fd, path = tempfile.mkstemp()

        header_lines = MafHeader.scheme_header_lines(scheme) + [
            "#key1 value1",
            "#key2 value2",
        ]
        header = MafHeader.from_lines(lines=header_lines)
        writer = MafWriter.from_path(header=header, path=path)
        values = ["string2", "3.14", "string1"]
        record_line = MafRecord.ColumnSeparator.join(values)
        record = MafRecord.from_line(line=record_line,
                                     scheme=scheme,
                                     line_number=1)
        writer += record
        writer.write(record)
        writer.close()

        self.assertListEqual(read_lines(path),
                             header_lines + [record_line, record_line])
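
The record is written twice to show that the operator form and the explicit call are interchangeable; a minimal sketch of that equivalence (assuming MafWriter routes += through __iadd__, which the paired calls above suggest):

# Both statements append the same formatted record line to the file.
writer += record       # operator form
writer.write(record)   # explicit form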
Example #7
    def test_with_sorting(self):
        scheme = TestMafWriter.TestCoordinateScheme()
        fd, path = tempfile.mkstemp()

        # Create the header
        header_lines = (MafHeader.scheme_header_lines(scheme) +
                        ["#key1 value1", "#key2 value2"] + [
                            "%s%s %s" % (
                                MafHeader.HeaderLineStartSymbol,
                                MafHeader.SortOrderKey,
                                Coordinate().name(),
                            )
                        ] + ["\t".join(scheme.column_names())])
        header = MafHeader.from_lines(
            lines=header_lines,
            validation_stringency=ValidationStringency.Silent)

        # Write the header, and the record twice
        writer = MafWriter.from_path(
            header=header,
            validation_stringency=ValidationStringency.Lenient,
            path=path,
            assume_sorted=False,
        )
        writer += TestMafWriter.DummyRecord("chr1", 2, 2)
        writer += TestMafWriter.DummyRecord("chr1", 3, 3)
        writer += TestMafWriter.DummyRecord("chr1", 4, 4)
        writer.close()

        reader = MafReader.reader_from(path=path, scheme=scheme)
        header = reader.header()
        records = [rec for rec in reader]
        reader.close()

        self.assertEqual(header.sort_order().name(), Coordinate.name())

        self.assertListEqual([r["Start_Position"].value for r in records],
                             [2, 3, 4])
        self.assertListEqual([r["End_Position"].value for r in records],
                             [2, 3, 4])
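
With assume_sorted=False and a sort order declared in the header, the writer appears to take responsibility for ordering; a sketch under that assumption, with records added out of order (DummyRecord as defined on the test class):

# Assumed maf-lib behavior: assume_sorted=False re-sorts before writing.
writer = MafWriter.from_path(header=header, path=path, assume_sorted=False)
writer += TestMafWriter.DummyRecord("chr1", 4, 4)
writer += TestMafWriter.DummyRecord("chr1", 2, 2)  # earlier coordinate, added last
writer.close()  # expected on disk: Start_Position 2, then 4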
Example #8
    def test_gz_support(self):
        fd, path = tempfile.mkstemp(suffix=".gz")

        lines = [
            TestMafWriter.__version_line,
            TestMafWriter.__annotation_line,
            "#key1 value1",
            "#key2 value2",
            TestMafWriter.__keys_line,
        ]
        with captured_output() as (stdout, stderr):
            header = MafHeader.from_lines(lines=lines)
            writer = MafWriter.from_path(header=header, path=path)
            writer.close()
            self.assertListEqual(read_lines(path), lines)
            self.assertEqual(
                str(writer.header()) + "\n" + TestMafWriter.__keys_line,
                "\n".join(lines),
            )
        stdout = stdout.getvalue().rstrip('\r\n').split("\n")
        stderr = stderr.getvalue().rstrip('\r\n').split("\n")
        self.assertListEqual(stdout, [''])
        self.assertListEqual(stderr, [''])
Example #9
    def do_work(self):
        """Main wrapper function for running protect MAF merging"""

        # Reader
        self.load_readers()

        # Header
        self.setup_maf_header()

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)

        # Sorter
        sorter = MafSorter(max_objects_in_ram=100000,
                           sort_order_name=BarcodesAndCoordinate.name(),
                           scheme=self.maf_header.scheme(),
                           contigs=self.maf_header.contigs())

        # Merger
        self._merger = MafRecordMerger_1_0_0(self._scheme)

        # Overlap iterator
        o_iter = LocatableOverlapIterator(
            self.maf_readers,
            contigs=self.maf_header.contigs(),
            peekable_iterator_class=FilteringPeekableIterator)

        # Normal depth (NDP) filter
        ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
        ndp_tag = ndp_filter.tags[0]

        # Counts
        processed = 0
        try:
            for record in o_iter:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} overlapping intervals...".format(
                            processed))

                result = OverlapSet(record, self.callers)

                for maf_record in self._merger.merge_records(result):
                    if maf_record is not None:
                        # Recheck normal depth
                        gdc_filters = maf_record['GDC_FILTER'].value
                        has_tag = ndp_tag in gdc_filters
                        ndp = ndp_filter.filter(maf_record)
                        if has_tag != ndp:
                            if ndp:
                                gdc_filters.extend(ndp_filter.tags)
                            else:
                                gdc_filters = list(
                                    filter(lambda x: x != ndp_filter.tags[0],
                                           gdc_filters))

                            maf_record["GDC_FILTER"] = get_builder(
                                "GDC_FILTER",
                                self._scheme,
                                value=sorted(gdc_filters))

                        # Add to sorter
                        sorter += maf_record

                processed += 1

            self.logger.info(
                "Writing {0} sorted, merged records...".format(processed))

            # Writer
            self.maf_writer = MafWriter.from_path(
                path=self.options['output_maf'],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict)

            counter = 0
            for record in sorter:
                if counter > 0 and counter % 1000 == 0:
                    self.logger.info(
                        "Wrote {0} sorted, merged records...".format(counter))
                self.maf_writer += record
                counter += 1

            self.logger.info(
                "Finished writing {0} sorted, merged records.".format(counter))

        finally:
            for reader in self.maf_readers:
                reader.close()

            sorter.close()

            if self.maf_writer:
                self.maf_writer.close()
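
The method above follows a sort-then-write pattern worth isolating; a distilled sketch in which header, merged_records, and out_path are placeholders for the example's state:

# Distilled sort-then-write pattern; the three names above are placeholders.
sorter = MafSorter(max_objects_in_ram=100000,
                   sort_order_name=BarcodesAndCoordinate.name(),
                   scheme=header.scheme(),
                   contigs=header.contigs())
for record in merged_records:   # records may arrive in any order
    sorter += record
writer = MafWriter.from_path(path=out_path, header=header)
for record in sorter:           # iteration yields records in sorted order
    writer += record
sorter.close()
writer.close()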
Example #10
    def do_work(self):
        """Main wrapper function for running vcf2maf"""
        self.logger.info(
            "Processing input vcf {0}...".format(self.options["input_vcf"])
        )

        # Initialize the maf file
        self.setup_maf_header()

        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=BarcodesAndCoordinate.name(),
            scheme=self.maf_header.scheme(),
            fasta_index=self.options["reference_fasta_index"],
        )

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Initialize vcf reader
        vcf_object = pysam.VariantFile(self.options["input_vcf"])
        tumor_sample_id = self.options["tumor_vcf_id"]
        normal_sample_id = self.options["normal_vcf_id"]
        is_tumor_only = self.options["tumor_only"]

        try:
            # Validate samples
            tumor_idx = assert_sample_in_header(
                vcf_object, self.options["tumor_vcf_id"]
            )
            normal_idx = assert_sample_in_header(
                vcf_object, self.options["normal_vcf_id"], can_fail=is_tumor_only
            )

            # extract annotation from header
            ann_cols_format, vep_key = extract_annotation_from_header(
                vcf_object, vep_key="CSQ"
            )

            # Initialize annotators
            self.setup_annotators()

            # Initialize filters
            self.setup_filters()

            # Convert
            line = 0
            for vcf_record in vcf_object.fetch():

                line += 1

                if line % 1000 == 0:
                    self.logger.info("Processed {0} records...".format(line))

                # Extract data
                data = self.extract(
                    tumor_sample_id,
                    normal_sample_id,
                    tumor_idx,
                    normal_idx,
                    ann_cols_format,
                    vep_key,
                    vcf_record,
                    is_tumor_only,
                )

                # Skip rare cases where VEP doesn't provide IMPACT or the consequence is "?"
                if (
                    not data["selected_effect"]["IMPACT"]
                    or data["selected_effect"]["One_Consequence"] == "?"
                ):
                    self.logger.warning(
                        "Skipping record with unknown impact or consequence: {0} - {1}".format(
                            data["selected_effect"]["IMPACT"],
                            data["selected_effect"]["One_Consequence"],
                        )
                    )
                    continue

                # Transform
                maf_record = self.transform(
                    vcf_record, data, is_tumor_only, line_number=line
                )

                # Add to sorter
                sorter += maf_record

            # Write
            self.logger.info("Writing {0} sorted records...".format(line))
            self.maf_writer = MafWriter.from_path(
                path=self.options["output_maf"],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict,
            )

            counter = 0
            for record in sorter:

                counter += 1

                if counter % 1000 == 0:
                    self.logger.info("Wrote {0} records...".format(counter))

                self.maf_writer += record

            self.logger.info("Finished writing {0} records".format(counter))

        finally:
            vcf_object.close()
            sorter.close()
            if self.maf_writer:
                self.maf_writer.close()
            for anno in self.annotators:
                if self.annotators[anno]:
                    self.annotators[anno].shutdown()

        self.logger.info("Finished")
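
The conversion's VCF side rests on pysam; a minimal sketch of the iteration the method builds on (the input path is a placeholder):

# Minimal pysam iteration the converter builds on; path is a placeholder.
import pysam

vcf = pysam.VariantFile("in.vcf.gz")
try:
    for rec in vcf.fetch():  # iterate records in file order
        print(rec.chrom, rec.pos, rec.ref, rec.alts)
finally:
    vcf.close()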
Example #11
    def do_work(self):
        """Main wrapper function for running public MAF filter"""
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options["input_maf"],
            validation_stringency=ValidationStringency.Strict,
        )

        # Header
        self.setup_maf_header()

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Counts
        processed = 0
        hotspot_gdc_set = {"gdc_pon", "common_in_gnomAD"}
        nonexonic_set = {"NonExonic"}

        try:
            for record in self.maf_reader:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} records...".format(processed))

                callers = record["callers"].value
                if (len(callers) >= self.options["min_callers"] and
                        record["Mutation_Status"].value.value == "Somatic"):

                    self.metrics.add_sample_swap_metric(record)

                    gdc_filters = record["GDC_FILTER"].value
                    gfset = set(gdc_filters)

                    if self.is_hotspot(record):
                        other_filts = gfset - hotspot_gdc_set
                        if len(other_filts) == 0:
                            self.write_record(record)
                        elif (len(other_filts - nonexonic_set) == 0
                              and self.is_splice(record)):
                            # Rescue splicing if NonExonic
                            self.write_record(record)

                    # Rescue splicing if NonExonic
                    elif (len(gfset - nonexonic_set) == 0
                          and self.is_splice(record)):
                        self.write_record(record)

                    elif not gfset:
                        self.write_record(record)

                processed += 1
                self.metrics.input_records += 1

            self.logger.info("Processed {0} records.".format(processed))
            print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))

        finally:

            self.maf_reader.close()
            self.maf_writer.close()