Example #1
0
 def test_not_updating_annotation_source(self):
     """Collapsing without a new annotation source must leave the datasource as it was."""
     mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
     mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

     collapser = ColumnCollapser()
     collapser.update_mutation(mutation)

     # The annotation was created with source "TEST"; it must still report that source.
     collapsed = mutation.getAnnotation("ALT_F2R1")
     self.assertEqual(collapsed.getDatasource(), "TEST")
Example #2
0
    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """Set up the renderer from an output file name, a config file and optional options.

        :param filename: stored as the output file name
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: mapping keyed by OptionConstants values; an empty dict is used when None
        """
        if other_options is None:
            options = dict()
        else:
            options = other_options

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = options

        # The configured prepend is always read; the NO_PREPEND option then blanks it out.
        configured_prepend = self.config.get("general", "prepend")
        self._prepend = "" if options.get(OptionConstants.NO_PREPEND, False) else configured_prepend

        # Flag deciding whether annotations that did not come from the INPUT take precedence.
        self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        # Allelic depth splitting is on unless the option says otherwise.
        self._is_splitting_allelic_depth = options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        exposed_csv = self.config.get("general", "exposedColumns")
        self.exposedColumns = set(exposed_csv.split(','))

        self._is_entrez_id_message_logged = False

        # A collapser (and the suffix used for its backup columns) exists only when collapsing is requested.
        self._is_collapsing_number_cols = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"
        else:
            self._column_collapser = None
            self._column_collapser_suffix = None
Example #3
0
    def test_simple_collapse(self):
        """Check the collapsed values produced for pipe-delimited numeric annotations on two mutations."""
        locus = dict(chr="1", start="10000", end="10000")

        first = MutationDataFactory.default_create(**locus)
        first_raw = (
            ('ALT_F2R1', "34|36"),
            ('i_t_Foxog', ".509|.511"),
            ('i_tumor_f', ".200|.210"),
            ('hamilcar', "0|0"),
            ('donotcollapse', "1|45"),
        )
        for name, raw in first_raw:
            first.createAnnotation(name, raw)

        second = MutationDataFactory.default_create(**locus)
        second_raw = (
            ('ALT_F2R1', "36|38"),
            ('i_t_Foxog', ".500|.510"),
            ('i_tumor_f', ".100|.110"),
            ('hamilcar', "0.01|0"),
            ('barca', "0.02|0"),
            ('donotcollapse', "100|4500"),
        )
        for name, raw in second_raw:
            second.createAnnotation(name, raw)

        collapser = ColumnCollapser()

        # First mutation: string check, then numeric checks, then the untouched column.
        collapser.update_mutation(first)
        self.assertEqual(first['ALT_F2R1'], "34")
        for name, expected in (('i_t_Foxog', ".510"), ('i_tumor_f', ".205"), ('hamilcar', "0")):
            self.assertEqual(float(first[name]), float(expected))
        self.assertEqual(first['donotcollapse'], "1|45")

        # Second mutation goes through the same collapser instance.
        collapser.update_mutation(second)
        self.assertEqual(second['ALT_F2R1'], "36")
        second_numeric = (
            ('i_t_Foxog', ".505"),
            ('i_tumor_f', ".105"),
            ('hamilcar', "0.005"),
            ('barca', "0.01"),
        )
        for name, expected in second_numeric:
            self.assertEqual(float(second[name]), float(expected))
        self.assertEqual(second['donotcollapse'], "100|4500")
Example #4
0
 def test_not_updating_annotation_source(self):
     """The datasource of a collapsed annotation stays unchanged when no new source is supplied."""
     locus = dict(chr="1", start="10000", end="10000")
     mutation = MutationDataFactory.default_create(**locus)
     mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

     collapser = ColumnCollapser()
     collapser.update_mutation(mutation)

     # Source given at creation time was "TEST"; it is expected to survive the update.
     source_after = mutation.getAnnotation("ALT_F2R1").getDatasource()
     self.assertEqual(source_after, "TEST")
Example #5
0
 def test_annotation_copy(self):
     """With a copy suffix, a backup annotation keeps the pre-collapse value and its original source."""
     mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
     mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

     collapser = ColumnCollapser()
     collapser.update_mutation(mutation, new_annotation_source="foo", copy_old_suffix="_full")

     # The backup carries the raw value; the original name carries the collapsed one.
     self.assertEqual(mutation["ALT_F2R1_full"], "|36")
     self.assertEqual(mutation["ALT_F2R1"], "36")

     # The backup keeps the old source, and the two sources must differ.
     self.assertEqual(mutation.getAnnotation("ALT_F2R1_full").getDatasource(), "TEST")
     collapsed_source = mutation.getAnnotation("ALT_F2R1").getDatasource()
     backup_source = mutation.getAnnotation("ALT_F2R1_full").getDatasource()
     self.assertTrue(collapsed_source != backup_source)
Example #6
0
    def test_annotation_copy_collision(self):
        """When the suffixed backup name already exists, the mutation's own duplicate handling decides the outcome."""
        locus = dict(chr="1", start="10000", end="10000")

        # Case 1: default mutation -- the backup copy is expected to raise.
        strict = MutationDataFactory.default_create(**locus)
        strict.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
        strict.createAnnotation('ALT_F2R1_full', "going_to_be_overwritten", annotationSource="TEST")

        raised = False
        collapser = ColumnCollapser()
        try:
            collapser.update_mutation(strict, copy_old_suffix="_full")
        except DuplicateAnnotationException:
            raised = True
        self.assertTrue(raised, "Did not see duplicate annotation exception")

        # Case 2: mutation created with allow_overwriting=True -- the backup replaces the old value.
        lenient = MutationDataFactory.default_create(allow_overwriting=True, **locus)
        lenient.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
        lenient.createAnnotation('ALT_F2R1_full', "going_to_be_overwritten", annotationSource="TEST")

        collapser = ColumnCollapser()
        collapser.update_mutation(lenient, copy_old_suffix="_full")
        self.assertEqual(lenient['ALT_F2R1_full'], "30|36")
        self.assertEqual(lenient['ALT_F2R1'], "30")
Example #7
0
 def test_annotation_copy(self):
     """Requesting a copy suffix yields a backup annotation holding the old value and old source."""
     locus = dict(chr="1", start="10000", end="10000")
     mutation = MutationDataFactory.default_create(**locus)
     mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

     collapse_kwargs = dict(new_annotation_source="foo", copy_old_suffix="_full")
     collapser = ColumnCollapser()
     collapser.update_mutation(mutation, **collapse_kwargs)

     # Raw value lives under the suffixed name, collapsed value under the original name.
     self.assertEqual(mutation["ALT_F2R1_full"], "|36")
     self.assertEqual(mutation["ALT_F2R1"], "36")

     # Backup keeps source "TEST"; the collapsed annotation must report a different source.
     self.assertEqual(mutation.getAnnotation("ALT_F2R1_full").getDatasource(), "TEST")
     source_of_collapsed = mutation.getAnnotation("ALT_F2R1").getDatasource()
     source_of_backup = mutation.getAnnotation("ALT_F2R1_full").getDatasource()
     self.assertTrue(source_of_collapsed != source_of_backup)
Example #8
0
    def test_cannot_collapse(self):
        """Values that cannot be collapsed must not stop the update of the remaining annotations."""
        mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
        raw_values = (
            ('ALT_F2R1', "|36"),
            ('i_t_Foxog', "|"),
            ('i_tumor_f', ""),
            ('hamilcar', "0|blah"),
            ('barca', "carthage_rules"),
            ('donotcollapse', "1|45"),
        )
        for name, raw in raw_values:
            mutation.createAnnotation(name, raw)

        collapser = ColumnCollapser()
        collapser.update_mutation(mutation)

        # Expected value of every annotation after the update, checked in creation order.
        expected_values = (
            ('ALT_F2R1', "36"),
            ('i_t_Foxog', ""),
            ('i_tumor_f', ""),
            ('hamilcar', "0|blah"),
            ('barca', "carthage_rules"),
            ('donotcollapse', "1|45"),
        )
        for name, expected in expected_values:
            self.assertEqual(mutation[name], expected)
Example #9
0
    def test_cannot_collapse(self):
        """The collapser moves on when an annotation value cannot be collapsed."""
        locus = dict(chr="1", start="10000", end="10000")
        mutation = MutationDataFactory.default_create(**locus)

        # (annotation name, value before update, value expected after update)
        cases = [
            ('ALT_F2R1', "|36", "36"),
            ('i_t_Foxog', "|", ""),
            ('i_tumor_f', "", ""),
            ('hamilcar', "0|blah", "0|blah"),
            ('barca', "carthage_rules", "carthage_rules"),
            ('donotcollapse', "1|45", "1|45"),
        ]
        for name, before, _ in cases:
            mutation.createAnnotation(name, before)

        collapser = ColumnCollapser()
        collapser.update_mutation(mutation)

        for name, _, after in cases:
            self.assertEqual(mutation[name], after)
Example #10
0
    def test_annotation_copy_collision(self):
        """A backup-name collision is resolved by the mutation: it raises by default and overwrites when allowed."""
        backup_name = 'ALT_F2R1_full'

        # Default mutation: a pre-existing backup annotation is expected to trigger the exception.
        strict = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
        strict.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
        strict.createAnnotation(backup_name, "going_to_be_overwritten", annotationSource="TEST")

        saw_duplicate = False
        collapser = ColumnCollapser()
        try:
            collapser.update_mutation(strict, copy_old_suffix="_full")
        except DuplicateAnnotationException:
            saw_duplicate = True
        self.assertTrue(saw_duplicate, "Did not see duplicate annotation exception")

        # Overwriting mutation: same setup, but the backup is expected to replace the old value.
        lenient = MutationDataFactory.default_create(chr="1", start="10000", end="10000",
                                                     allow_overwriting=True)
        lenient.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
        lenient.createAnnotation(backup_name, "going_to_be_overwritten", annotationSource="TEST")

        collapser = ColumnCollapser()
        collapser.update_mutation(lenient, copy_old_suffix="_full")
        self.assertEqual(lenient[backup_name], "30|36")
        self.assertEqual(lenient['ALT_F2R1'], "30")
Example #11
0
    def test_simple_collapse(self):
        """Verify the values left on two mutations after numeric collapsing of pipe-delimited annotations."""
        def build_mutation(raw_annotations):
            # Create a mutation at the shared locus and attach the annotations in the given order.
            mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
            for name, raw in raw_annotations:
                mutation.createAnnotation(name, raw)
            return mutation

        first = build_mutation([
            ('ALT_F2R1', "34|36"),
            ('i_t_Foxog', ".509|.511"),
            ('i_tumor_f', ".200|.210"),
            ('hamilcar', "0|0"),
            ('donotcollapse', "1|45"),
        ])
        second = build_mutation([
            ('ALT_F2R1', "36|38"),
            ('i_t_Foxog', ".500|.510"),
            ('i_tumor_f', ".100|.110"),
            ('hamilcar', "0.01|0"),
            ('barca', "0.02|0"),
            ('donotcollapse', "100|4500"),
        ])

        collapser = ColumnCollapser()

        collapser.update_mutation(first)
        self.assertEqual(first['ALT_F2R1'], "34")
        # Numeric columns are compared as floats.
        for name, expected in [('i_t_Foxog', ".510"), ('i_tumor_f', ".205"), ('hamilcar', "0")]:
            self.assertEqual(float(first[name]), float(expected))
        self.assertEqual(first['donotcollapse'], "1|45")

        collapser.update_mutation(second)
        self.assertEqual(second['ALT_F2R1'], "36")
        for name, expected in [('i_t_Foxog', ".505"), ('i_tumor_f', ".105"),
                               ('hamilcar', "0.005"), ('barca', "0.01")]:
            self.assertEqual(float(second[name]), float(expected))
        self.assertEqual(second['donotcollapse'], "100|4500")
Example #12
0
class TcgaMafOutputRenderer(OutputRenderer):
    """

    Render a generator or list of mutations into a TCGA MAF file.

    TCGA MAF specification can be found at: https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification

    Version specified in the config file in the "general" section.
    """
    def getTcgaMafVersion(self):
        """Return the "version" option from the "general" section of the config."""
        return self.config.get("general", "version")

    # Names of the two output columns produced when allelic_depth is split.
    OUTPUT_T_REF_COUNT = 't_ref_count'
    OUTPUT_T_ALT_COUNT = 't_alt_count'

    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """Configure the renderer.

        :param filename: path of the MAF file written by renderMutations
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: mapping keyed by OptionConstants values; an empty dict is used when None
        """
        options = dict() if other_options is None else other_options

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = options

        self._prepend = self.config.get("general", "prepend")
        if self.options.get(OptionConstants.NO_PREPEND, False):
            self._prepend = ""

        # _is_reannotating is a flag to determine whether we should give precedence to annotations that were not
        #   annotated as part of the INPUT.
        self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        self._is_splitting_allelic_depth = self.options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        self.exposedColumns = set(self.config.get("general", "exposedColumns").split(','))

        # Ensures the Entrez Gene ID warning is logged at most once per renderer.
        self._is_entrez_id_message_logged = False

        # The collapser and its backup-column suffix exist only when collapsing is requested.
        self._is_collapsing_number_cols = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        self._column_collapser = None
        self._column_collapser_suffix = None
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"

    def lookupNCBI_Build(self, build):
        """ If a build number exists in the config file, use that.  Otherwise, use the name specified.

        :param build: build name to look up in the "genomeBuild" section of the config
        :return: the configured value for the build, or ``build`` itself when the config has no such option
        """
        if not self.config.has_option("genomeBuild", build):
            return build
        # The looked-up value must be returned; without the return the caller received None.
        return self.config.get("genomeBuild", build, vars={"genomeBuild":build})

    def _createMutationRow(self, m, headers, fieldMapping):
        """ Create a single mutation dictionary (i.e. render a line).  A dictionary as per the csv library.
        Headers will usually be the fieldMapping keys, but extra parameter is provided here in case subset is desired.
        Also, allows caching of the keys ahead of time.

        :param m: mutation; read through ``m.get(annotation_name, default)``
        :param headers: column names to render
        :param fieldMapping: dict of column name to annotation name
        :return: dict of column name to value; "__UNKNOWN__" when the mutation has no such annotation
        """
        row = dict()
        for h in headers:
            annotation = fieldMapping[h]
            value = m.get(annotation, "__UNKNOWN__")
            row[h] = value
        return row

    def _determine_new_allele_if_blank(self, d, allele_key, new_value):
        """Return the value stored under allele_key, falling back to new_value when missing or blank.

        :param d: dictionary of column names
        :param allele_key: key to replace if "" or does not exist.
        :param new_value: value to use if "" or does not exist
        :return: the existing value, or new_value when the existing value is absent or only whitespace
        """
        result = d.get(allele_key, new_value)
        if result.strip() == "":
            result = new_value
        return result

    def _update_validation_values(self, row):
        """ If Validation_Status  == "Valid" then
          Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 cannot  be null
         If Mutation_Status == "Somatic" and Validation_Status == "Valid", then
          Match_Norm_Validation_Allele1 == Match_Norm_Validation_Allele2 == Reference_Allele and (Tumor_Validation_Allele1 or Tumor_Validation_Allele2) != Reference_Allele

         If Validation_Status == "Invalid" then
          Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 cannot be null AND Tumor_Validation_Allelle1 == Match_Norm_Validation_Allele1 AND Tumor_Validation_Allelle2 == Match_Norm_Validation_Allele2  (Added as a replacement for 8a as a result of breakdown)


        IMPORTANT: The input parameter is altered.

        :param row: dict with name value pairs that include the TCGA MAF columns.  This is usually not the mutation.
        """
        if row['Validation_Status'] == "Valid":
            if row['Mutation_Status'] == "Somatic":
                # Only blank validation alleles are filled in; existing values are left alone.
                row['Tumor_Validation_Allele1'] = self._determine_new_allele_if_blank(row, 'Tumor_Validation_Allele1',
                                                                                      row['Reference_Allele'])
                row['Tumor_Validation_Allele2'] = self._determine_new_allele_if_blank(row, 'Tumor_Validation_Allele2',
                                                                                      row['Tumor_Seq_Allele2'])
                row['Match_Norm_Validation_Allele1'] = self._determine_new_allele_if_blank(row,
                                                                                           'Match_Norm_Validation_Allele1',
                                                                                           row['Reference_Allele'])
                row['Match_Norm_Validation_Allele2'] = self._determine_new_allele_if_blank(row,
                                                                                           'Match_Norm_Validation_Allele2',
                                                                                           row['Reference_Allele'])

        if row['Validation_Status'] == "Invalid":

            # Only valid mutation status value is None for an invalid mutation
            if row['Mutation_Status'] != "None":
                row['Mutation_Status'] = "None"

            # If the alleles are blank, populate properly for invalid mutation.  Basically, everything becomes reference
            row['Match_Norm_Validation_Allele1'] = self._determine_new_allele_if_blank(row,
                                                                                       'Match_Norm_Validation_Allele1',
                                                                                       row['Reference_Allele'])
            row['Match_Norm_Validation_Allele2'] = self._determine_new_allele_if_blank(row,
                                                                                       'Match_Norm_Validation_Allele2',
                                                                                       row['Reference_Allele'])
            # Tumor alleles default to the (possibly just filled) matched-normal alleles.
            row['Tumor_Validation_Allele1'] = self._determine_new_allele_if_blank(row, 'Tumor_Validation_Allele1',
                                                                                  row['Match_Norm_Validation_Allele1'])
            row['Tumor_Validation_Allele2'] = self._determine_new_allele_if_blank(row, 'Tumor_Validation_Allele2',
                                                                                  row['Match_Norm_Validation_Allele2'])


    def _writeMutationRow(self, dw, fieldMap, fieldMapKeys, m):
        """ If this row should be rendered, then write it to the given DictWriter

        Additionally, apply corrections needed to make this a valid TCGA MAF.

        This method must be called as a last step before writing the output, as it relies on the output row, as opposed
            to the annotated mutation.

        :param dw: DictWriter
        :param fieldMap: dict of column name to annotation name
        :param fieldMapKeys: column names to render for this row
        :param m: mutation to render
        :return: None
        """
        row = self._createMutationRow(m, fieldMapKeys, fieldMap)

        # Use HGNC Entrez Gene ID, if available and nothing else has populated it.,
        if row['Entrez_Gene_Id'] == "" and m.get('HGNC_Entrez Gene ID(supplied by NCBI)', "") != "":
            row['Entrez_Gene_Id'] = m.get('HGNC_Entrez Gene ID(supplied by NCBI)')

        if row['Entrez_Gene_Id'] == "":
            row['Entrez_Gene_Id'] = "0"

        if not self._is_entrez_id_message_logged and row['Entrez_Gene_Id'] == "0" and row['Hugo_Symbol'] != "Unknown":
            # Logger.warning replaces the deprecated Logger.warn alias.
            logging.getLogger(__name__).warning("Entrez Gene ID was zero, but Hugo Symbol was not Unknown.  Is the HGNC and/or Transcript datasource complete?")
            self._is_entrez_id_message_logged = True
        self._update_validation_values(row)

        # Handle the splitting of allelic depth
        if row.get('allelic_depth', "").strip() != "" and self._is_splitting_allelic_depth:
            vals = row.get('allelic_depth', "").split(",")
            ref_count = vals[0]
            alt_count = vals[1]
            row[TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT] = alt_count
            row[TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT] = ref_count

        dw.writerow(row)

    def _add_output_annotations(self, m):
        """Add annotations specific to the TCGA MAF

        IMPORTANT: The mutation is altered.

        :param m: mutation receiving the ncbi_build annotation, the split allelic depth counts (when enabled
            and allelic_depth is populated) and the collapsed numeric columns (when enabled)
        """
        m.createAnnotation('ncbi_build', self.lookupNCBI_Build(m.build), annotationSource="OUTPUT")
        if self._is_splitting_allelic_depth and m.get('allelic_depth', "").strip() != "":
            # Handle the splitting of allelic depth
            vals = m.get('allelic_depth', "").split(",")
            ref_count = vals[0]
            alt_count = vals[1]
            m.createAnnotation(TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, alt_count, "OUTPUT")
            m.createAnnotation(TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT, ref_count, "OUTPUT")

        if self._is_collapsing_number_cols:
            self._column_collapser.update_mutation(m, "OUTPUT", self._column_collapser_suffix)

    def renderMutations(self, mutations, metadata=None, comments=None):
        """ Returns a file name pointing to the maf file that is generated.

        :param mutations: iterable (generator, list, ...) of mutations to render
        :param metadata: mapping whose keys are used as annotation names when there are no mutations;
            an empty OrderedDict is used when None
        :param comments: list of strings, each written as a "## " line after the version line
        :return: the output file name given at construction
        :raises Exception: any error raised while rendering a mutation is logged and re-raised
        """
        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general", "requiredColumns").split(',')
        optionalColumns = self.config.get("general", "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns
        headers.extend(optionalColumns)

        # Normalize to an iterator so that lists are accepted as well as generators.
        mutations = iter(mutations)

        # Peek at the first mutation to learn the annotation names.
        m = next(mutations, None)
        if m is not None:
            annotations = MutUtils.getAllAttributeNames(m)
        else:
            # There are no mutations, so use the config file and metadata to determine what columns to output
            metadataAnnotations = metadata.keys()
            annotations = set(headers).union(metadataAnnotations)

        # If we are splitting allelic_depth into two fields, add those to the headers.  Note that the mutations will
        #  be annotated properly later.
        if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
            depth_fields = [TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT]
            headers.extend(depth_fields)

        if m is not None:

            # Add columns for the new annotations created as part of collapsing cols
            additional_internal_columns = []
            if self._column_collapser is not None:
                additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(m, self._column_collapser_suffix)

            # Create a mapping between column name and annotation name
            field_map = FieldMapCreator.create_field_map(headers, m, self.alternativeDictionary,
                                                    self.config.getboolean("general", "displayAnnotations"),
                                                    exposed_fields=self.exposedColumns, prepend=self._prepend,
                                                    deprioritize_input_annotations=self._is_reannotating,
                                                    additional_columns=additional_internal_columns)

            field_map_keys = field_map.keys()
            internal_fields = sorted(list(set(field_map_keys).difference(headers)))
            headers.extend(internal_fields)

        ctr = 0

        # The context manager closes the output file on every exit path, including header-writing failures.
        with open(self._filename, 'w') as fp:
            # Initialize the output file and write a header.
            fp.write("#version " + self.getTcgaMafVersion() + "\n")

            for c in comments:
                fp.write("## " + c + "\n")

            # Initialize a csv DictWriter with every collected header (no header filtering is done here).
            dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
            dw.writeheader()

            try:
                # The first mutation was already pulled off the iterator above; render it first.
                if m is not None:
                    self._add_output_annotations(m)
                    self._writeMutationRow(dw, field_map, field_map_keys, m)
                    ctr += 1

                for m in mutations:

                    # Add the NCBI build
                    self._add_output_annotations(m)
                    self._writeMutationRow(dw, field_map, field_map_keys, m)

                    # Update mutation count and log every 1000 mutations
                    ctr += 1
                    if (ctr % 1000) == 0:
                        self.logger.info("Rendered " + str(ctr) + " mutations.")
            except Exception:
                import traceback
                self.logger.error(traceback.format_exc())
                # Guard against there being no current mutation so the handler cannot mask the real error.
                locus = [] if m is None else [m.chr,m.start,m.end,m.ref_allele,m.alt_allele]
                self.logger.error("Error at mutation " + str(ctr) + " " + str(locus) + ": ")
                self.logger.error("Incomplete: rendered %d mutations." % (ctr))
                # Bare raise keeps the original traceback.
                raise

        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warning("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")
        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename