Exemple #1
0
    def test_reannotating_with_prepends(self):
        """Test that we will disregard the prepend when looking for fields to write"""
        m = MutationDataFactory.default_create()
        m.createAnnotation('i_foo', "blah", "INPUT")
        m.createAnnotation('foo', "bloop", "some datasource")

        headers = ['i_foo']
        alt_dict = {'i_foo': ['i_i_foo', 'foo']}
        mapping = FieldMapCreator.create_field_map(headers, m, alt_dict, is_render_internal_fields=True,deprioritize_input_annotations=True)
        self.assertTrue(mapping['i_foo'] == 'foo')

        mapping = FieldMapCreator.create_field_map(headers, m, alt_dict, is_render_internal_fields=True,deprioritize_input_annotations=False)
        self.assertTrue(mapping['i_foo'] == 'i_foo')
Exemple #2
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        if comments is None:
            comments = []

        outputHeaders = [
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NORMAL', 'PRIMARY'
        ]

        # Create a list of annotation names and make sure to catch the case where there are no variants specified.
        try:
            m = mutations.next()
        except StopIteration as si:
            m = None

        if m is not None:
            fp = self._createVcfHeaderFilePtr(comments, m)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        if m is not None:
            fieldsUsed = self.alternativeDictionary.keys()
            self.fieldMap = FieldMapCreator.create_field_map(
                fieldsUsed, m, self.alternativeDictionary, True)

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp,
                                   outputHeaders,
                                   delimiter="\t",
                                   lineterminator="\n")
        mutRow = self._createMutRow(m)

        if mutRow is not None:
            tsvWriter.writerow(mutRow)
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
        self.logger.info("Processed all " + str(ctr) +
                         " mutations.  Could not render: " +
                         str(unrenderableRows))
    def renderMutations(self, mutations, metadata=None, comments=None):
        if comments is None:
            comments = []

        outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

        # Create a list of annotation names and make sure to catch the case where there are no variants specified.
        try:
            m = mutations.next()
        except StopIteration as si:
            m = None

        if m is not None:
            fp = self._createVcfHeaderFilePtr(comments, m)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        if m is not None:
            fieldsUsed = self.alternativeDictionary.keys()
            self.fieldMap = FieldMapCreator.create_field_map(fieldsUsed, m, self.alternativeDictionary, True)

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")
        mutRow = self._createMutRow(m)

        if mutRow is not None:
            tsvWriter.writerow(mutRow)
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
        self.logger.info("Processed all " + str(ctr) + " mutations.  Could not render: " + str(unrenderableRows))
Exemple #4
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """ Returns a file name pointing to the maf file that is generated. """
        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general", "requiredColumns").split(',')
        optionalColumns = self.config.get("general", "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns
        headers.extend(optionalColumns)

        # Create a list of annotation names
        try:
            m = mutations.next()
            annotations = MutUtils.getAllAttributeNames(m)
        except StopIteration as si:

            # There are no mutations, so use the config file and metadata to determine what columns to output
            metadataAnnotations = metadata.keys()
            annotations = set(headers).union(metadataAnnotations)
            m = None

        # If we are splitting allelic_depth into two fields, add those to the headers.  Note that the mutations will
        #  be annotated properly later.
        if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
            depth_fields = [TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT]
            headers.extend(depth_fields)

        if m is not None:

            # Add columns for the new annotations created as part of collapsing cols
            additional_internal_columns = []
            if self._column_collapser is not None:
                additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(m, self._column_collapser_suffix)

            # Create a mapping between column name and annotation name
            field_map = FieldMapCreator.create_field_map(headers, m, self.alternativeDictionary,
                                                    self.config.getboolean("general", "displayAnnotations"),
                                                    exposed_fields=self.exposedColumns, prepend=self._prepend,
                                                    deprioritize_input_annotations=self._is_reannotating,
                                                    additional_columns=additional_internal_columns)

            field_map_keys = field_map.keys()
            internal_fields = sorted(list(set(field_map_keys).difference(headers)))
            headers.extend(internal_fields)

        # Initialize the output file and write a header.
        fp = file(self._filename, 'w')
        fp.write("#version " + self.getTcgaMafVersion() + "\n")
        
        for c in comments:
            fp.write("## " + c + "\n")
        
        # Initialize a csv DictWriter
        # Remove headers that start with "_"
        dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
        dw.writeheader()
        ctr = 0

        try:
            # Add the NCBI build
            if m is not None:
                self._add_output_annotations(m)
                self._writeMutationRow(dw, field_map, field_map_keys, m)
                ctr += 1

            for m in mutations:

                # Add the NCBI build
                self._add_output_annotations(m)
                self._writeMutationRow(dw, field_map, field_map_keys, m)
                
                # Update mutation count and log every 1000 mutations
                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Rendered " + str(ctr) + " mutations.")
        except Exception as e:
            import traceback
            self.logger.error(traceback.format_exc())
            self.logger.error("Error at mutation " + str(ctr) + " " + str([m.chr,m.start,m.end,m.ref_allele,m.alt_allele]) + ": ")
            self.logger.error("Incomplete: rendered %d mutations." % (ctr))
            fp.close()
            raise e
        
        fp.close()
        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warn("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")
        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename
Exemple #5
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        :param segments: iterable of MutationData
        :param metadata:
        :param comments:
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info(
            "Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            config_parser)

        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        fp = file(self._filename, 'w')
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        i = 0
        for i, seg in enumerate(segments):
            if annotations is None:
                annotations = seg.keys()
                field_mapping = FieldMapCreator.create_field_map(
                    headers,
                    seg,
                    self._alternativeDictionary,
                    is_render_internal_fields=True,
                    prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        if i == 0:
            logging.getLogger(__name__).info(
                "No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp,
                                headers,
                                delimiter="\t",
                                lineterminator="\n",
                                extrasaction="ignore")
        writer.writeheader()

        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..." %
                                                 ((i + 1), num_genes))

        fp.close()
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        :param segments: iterable of MutationData
        :param metadata:
        :param comments:
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        fp = file(self._filename, 'w')
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        i = 0
        for i, seg in enumerate(segments):
            if annotations is None:
                annotations = seg.keys()
                field_mapping = FieldMapCreator.create_field_map(headers, seg, self._alternativeDictionary, is_render_internal_fields=True, prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg


        if i == 0:
            logging.getLogger(__name__).info("No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()

        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i,gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..." % ((i+1),num_genes))

        fp.close()