Beispiel #1
0
    def createConfigTableKeys(self, configParser, configTable):
        """
        Register the field IDs and split rules found in the config on configTable.

        The INFO and FORMAT sections are read as reverse alternative dictionaries
        and every (key, value) pair is handed to addInfoFieldID / addFormatFieldID.
        The NOT_SPLIT_TAGS and SPLIT_TAGS sections are read as alternate key
        dictionaries and every (field type, IDs) pair is handed to
        addFieldIDsToNotSplitSet / addFieldIDsToSplitSet.

        :param configParser: parser passed to the ConfigUtils dictionary builders
        :param configTable: table object that receives the parsed entries
        """
        # INFO section
        info_ids = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
        for field_id, field_name in info_ids.items():
            configTable.addInfoFieldID(field_id, field_name)

        # FORMAT section
        format_ids = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
        for field_id, field_name in format_ids.items():
            configTable.addFormatFieldID(field_id, field_name)

        # NOT_SPLIT_TAGS section
        not_split_ids = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
        for field_type, field_ids in not_split_ids.items():
            configTable.addFieldIDsToNotSplitSet(field_type, field_ids)

        # SPLIT_TAGS section
        split_ids = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
        for field_type, field_ids in split_ids.items():
            configTable.addFieldIDsToSplitSet(field_type, field_ids)
    def createConfigTableKeys(self, configParser, configTable):
        """
        Register field names, descriptions and split rules from the config on configTable.

        Sections read as reverse alternative dictionaries: INFO, FORMAT, OTHER.
        Sections read as alternate key dictionaries: INFO_DESCRIPTION,
        FORMAT_DESCRIPTION, FILTER_DESCRIPTION, SPLIT_TAGS, NOT_SPLIT_TAGS.

        :param configParser: parser passed to the ConfigUtils dictionary builders
        :param configTable: table object that receives the parsed entries
        """
        # Parse fields from INFO section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "INFO")
        for name, ID in table.items():
            configTable.addInfoFieldName(name, ID)

        # Parse fields from FORMAT section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "FORMAT")
        for name, ID in table.items():
            configTable.addFormatFieldName(name, ID)

        # Parse fields from OTHER section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "OTHER")
        for name, ID in table.items():
            configTable.addOtherFieldName(name, ID)

        # The description values are rejoined with "," before being stored.
        # NOTE(review): presumably the builder split the configured text on
        # commas, so this restores the original description -- confirm.
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "INFO_DESCRIPTION")
        for name, description in table.items():
            configTable.addInfoFieldNameDescription(
                name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "FORMAT_DESCRIPTION")
        for name, description in table.items():
            configTable.addFormatFieldNameDescription(
                name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "FILTER_DESCRIPTION")
        for name, description in table.items():
            configTable.addFilterFieldNameDescription(
                name, ",".join(description))

        # Parse fields from SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToSplitSet(fieldType, names)

        # Parse fields from NOT_SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "NOT_SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToNotSplitSet(fieldType, names)
Beispiel #3
0
    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """
        Initialize the instance from the config file and the optional options dictionary.

        :param filename: stored as self._filename
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: dictionary of options looked up with OptionConstants keys;
            None is treated as an empty dictionary
        """
        if other_options is None:
            other_options = dict()

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = other_options

        # The configured prepend string is dropped entirely when NO_PREPEND is set.
        configured_prepend = self.config.get("general", "prepend")
        no_prepend = self.options.get(OptionConstants.NO_PREPEND, False)
        self._prepend = "" if no_prepend else configured_prepend

        # _is_reannotating is a flag to determine whether we should give precedence to annotations that were not
        #   annotated as part of the INPUT.
        self._is_reannotating = self.options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        self._is_splitting_allelic_depth = self.options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        exposed = self.config.get("general", "exposedColumns")
        self.exposedColumns = set(exposed.split(','))

        self._is_entrez_id_message_logged = False

        # The collapser and its suffix only exist when number-column collapsing is requested.
        self._is_collapsing_number_cols = self.options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"
        else:
            self._column_collapser = None
            self._column_collapser_suffix = None
 def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
     """
     Initialize the instance: remember the file name, parse the config and create empty lookup state.

     :param filename: stored as self._filename
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param otherOptions: not used by this constructor
     """
     self._filename = filename
     self.logger = logging.getLogger(__name__)

     parser = ConfigUtils.createConfigParser(configFile)
     self.config = parser
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser)

     # Both containers start out empty.
     self.seenDbSNPs = {}
     self.fieldMap = dict()
Beispiel #5
0
    def __init__(self,
                 filename,
                 configFile="tcgaMAF2.4_output.config",
                 other_options=None):
        """
        Initialize the instance from the config file and the optional options dictionary.

        TODO: Need functionality for not prepending the i_ on internal fields.

        :param filename: stored as self._filename
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: dictionary of options looked up with OptionConstants keys;
            None is treated as an empty dictionary
        """
        if other_options is None:
            other_options = dict()

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        #TODO: Read missing options from the config file or specify that error should be thrown.
        self.options = other_options

        # The configured prepend string is dropped entirely when NO_PREPEND is set.
        configured_prepend = self.config.get("general", "prepend")
        no_prepend = self.options.get(OptionConstants.NO_PREPEND, False)
        self._prepend = "" if no_prepend else configured_prepend

        exposed = self.config.get("general", "exposedColumns")
        self.exposedColumns = set(exposed.split(','))

        self._is_entrez_id_message_logged = False
Beispiel #6
0
 def __init__(self,
              mut,
              configFile="sample_name_selection.config",
              section="SAMPLE_NAME"):
     """
     Work out where the sample name comes from for the given mutation.

     The aliases for "sample_name", "sample_tumor_name" and "sample_normal_name"
     are read from the given config section and resolved against mut; the
     source column, sample name grabber, output annotation name and
     annotation source are then derived from those three annotations.

     :param mut: mutation passed to self._getAnnotationFromAliases
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param section: config section holding the alias lists
     """
     parser = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser, section)
     self.configFile = configFile

     # Resolved in the order sample, tumor, normal.
     alias_keys = ("sample_name", "sample_tumor_name", "sample_normal_name")
     sample, tumor, normal = [
         self._getAnnotationFromAliases(mut, aliases[key]) for key in alias_keys
     ]

     column = self._getSourceColumn(sample, tumor, normal)
     self._logSampleNameColumnDescription(column, sample, tumor, normal)
     self.sampleNameGrabber = self._getSampleNameGrabber(column, sample, tumor, normal)
     self.outputAnnotationName = self._deriveOutputAnnotationName(sample)
     self.annotationSource = self._deriveAnnotationSource(column)
Beispiel #7
0
    def __init__(self,
                 filename,
                 mutation_data_factory=None,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Open the maflite file and verify that its header has every required column.

        A required column counts as present when the header contains either the
        column itself or one of its configured alternatives.  "build" is never
        reported as missing.

        :param filename: maflite file handed to GenericTsvReader
        :param mutation_data_factory: forwarded to the parent constructor
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param genomeBuild: stored as self._build
        :param other_options: forwarded to the parent constructor
        :raises MafliteMissingRequiredHeaderException: when at least one required
            header (other than "build") cannot be found
        """
        super(MafliteInputMutationCreator, self).__init__(
            filename, mutation_data_factory, configFile, genomeBuild,
            other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Required column -> list of valid alternative headers, plus the
        # reverse dictionary built from that mapping.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

        required_headers = self.config.get("general", "required_headers")
        required_columns = sorted(required_headers.split(","))
        self._build = genomeBuild

        header_message = ("Initializing a maflite file with the following header: " +
                          str(self._tsvReader.getFieldNames()))
        self.logger.info(header_message)

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        missing = []
        for col in required_columns:
            if col in self._specified_fields:
                continue
            if any(alt in self._specified_fields
                   for alt in self._alternativeDict.get(col, [])):
                continue
            # build is optional.
            if col != "build":
                missing.append(col)
        missing.sort()

        if missing:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " + ",".join(missing))
Beispiel #8
0
 def __init__(self,
              filename,
              configFile="tcgaVCF1.1_output.config",
              otherOptions=None):
     """
     Initialize the instance: remember the file name, parse the config and create empty lookup state.

     :param filename: stored as self._filename
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param otherOptions: not used by this constructor
     """
     self._filename = filename
     self.logger = logging.getLogger(__name__)

     parser = ConfigUtils.createConfigParser(configFile)
     self.config = parser
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser)

     # Both containers start out empty.
     self.seenDbSNPs = {}
     self.fieldMap = dict()
    def createConfigTableKeys(self, configParser, configTable):
        """
        Register field names, descriptions and split rules from the config on configTable.

        Sections read as reverse alternative dictionaries: INFO, FORMAT, OTHER.
        Sections read as alternate key dictionaries: INFO_DESCRIPTION,
        FORMAT_DESCRIPTION, FILTER_DESCRIPTION, SPLIT_TAGS, NOT_SPLIT_TAGS.

        :param configParser: parser passed to the ConfigUtils dictionary builders
        :param configTable: table object that receives the parsed entries
        """
        # Parse fields from INFO section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
        for name, ID in table.items():
            configTable.addInfoFieldName(name, ID)

        # Parse fields from FORMAT section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
        for name, ID in table.items():
            configTable.addFormatFieldName(name, ID)

        # Parse fields from OTHER section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "OTHER")
        for name, ID in table.items():
            configTable.addOtherFieldName(name, ID)

        # The description values are rejoined with "," before being stored.
        # NOTE(review): presumably the builder split the configured text on
        # commas, so this restores the original description -- confirm.
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "INFO_DESCRIPTION")
        for name, description in table.items():
            configTable.addInfoFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FORMAT_DESCRIPTION")
        for name, description in table.items():
            configTable.addFormatFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FILTER_DESCRIPTION")
        for name, description in table.items():
            configTable.addFilterFieldNameDescription(name, ",".join(description))

        # Parse fields from SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToSplitSet(fieldType, names)

        # Parse fields from NOT_SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToNotSplitSet(fieldType, names)
Beispiel #10
0
 def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
     """
     Work out where the sample name comes from for the given mutation.

     The aliases for "sample_name", "sample_tumor_name" and "sample_normal_name"
     are read from the given config section and resolved against mut; the
     source column, sample name grabber, output annotation name and
     annotation source are then derived from those three annotations.

     :param mut: mutation passed to self._getAnnotationFromAliases
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param section: config section holding the alias lists
     """
     parser = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser, section)
     self.configFile = configFile

     # Resolved in the order sample, tumor, normal.
     alias_keys = ("sample_name", "sample_tumor_name", "sample_normal_name")
     sample, tumor, normal = [
         self._getAnnotationFromAliases(mut, aliases[key]) for key in alias_keys
     ]

     column = self._getSourceColumn(sample, tumor, normal)
     self._logSampleNameColumnDescription(column, sample, tumor, normal)
     self.sampleNameGrabber = self._getSampleNameGrabber(column, sample, tumor, normal)
     self.outputAnnotationName = self._deriveOutputAnnotationName(sample)
     self.annotationSource = self._deriveAnnotationSource(column)
    def __init__(self,
                 filename,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Open the maflite file and verify that its header has every required column.

        A required column counts as present when the header contains either the
        column itself or one of its configured alternatives.  "build" is never
        reported as missing.

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        :param filename: maflite file handed to GenericTsvReader
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param genomeBuild: stored as self._build
        :param other_options: ignored
        :raises MafliteMissingRequiredHeaderException: when at least one required
            header (other than "build") cannot be found
        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Required column -> list of valid alternative headers, plus the
        # reverse dictionary built from that mapping.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

        header_fields = self._tsvReader.getFieldNames()
        required_headers = self.config.get("general", "required_headers")
        required_columns = sorted(required_headers.split(","))
        self._build = genomeBuild

        missing = []
        for col in required_columns:
            if col in header_fields:
                continue
            if any(alt in header_fields
                   for alt in self._alternativeDict.get(col, [])):
                continue
            # build is optional.
            if col != "build":
                missing.append(col)
        missing.sort()

        header_message = ("Initializing a maflite file with the following header: " +
                          str(self._tsvReader.getFieldNames()))
        self.logger.info(header_message)

        if missing:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " + ",".join(missing))
    def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Open the maflite file and verify that its header has every required column.

        A required column counts as present when the header contains either the
        column itself or one of its configured alternatives.  "build" is never
        reported as missing.

        :param filename: maflite file handed to GenericTsvReader
        :param mutation_data_factory: forwarded to the parent constructor
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param genomeBuild: stored as self._build
        :param other_options: forwarded to the parent constructor
        :raises MafliteMissingRequiredHeaderException: when at least one required
            header (other than "build") cannot be found
        """
        super(MafliteInputMutationCreator, self).__init__(
            filename, mutation_data_factory, configFile, genomeBuild,
            other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Required column -> list of valid alternative headers, plus the
        # reverse dictionary built from that mapping.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

        required_headers = self.config.get("general", "required_headers")
        required_columns = sorted(required_headers.split(","))
        self._build = genomeBuild

        header_message = ("Initializing a maflite file with the following header: " +
                          str(self._tsvReader.getFieldNames()))
        self.logger.info(header_message)

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        missing = []
        for col in required_columns:
            if col in self._specified_fields:
                continue
            if any(alt in self._specified_fields
                   for alt in self._alternativeDict.get(col, [])):
                continue
            # build is optional.
            if col != "build":
                missing.append(col)
        missing.sort()

        if missing:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " + ",".join(missing))
    def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Open the maflite file and verify that its header has every required column.

        A required column counts as present when the header contains either the
        column itself or one of its configured alternatives.  "build" is never
        reported as missing.

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        :param filename: maflite file handed to GenericTsvReader
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param genomeBuild: stored as self._build
        :param other_options: ignored
        :raises MafliteMissingRequiredHeaderException: when at least one required
            header (other than "build") cannot be found
        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Required column -> list of valid alternative headers, plus the
        # reverse dictionary built from that mapping.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

        header_fields = self._tsvReader.getFieldNames()
        required_headers = self.config.get("general", "required_headers")
        required_columns = sorted(required_headers.split(","))
        self._build = genomeBuild

        missing = []
        for col in required_columns:
            if col in header_fields:
                continue
            if any(alt in header_fields
                   for alt in self._alternativeDict.get(col, [])):
                continue
            # build is optional.
            if col != "build":
                missing.append(col)
        missing.sort()

        header_message = ("Initializing a maflite file with the following header: " +
                          str(self._tsvReader.getFieldNames()))
        self.logger.info(header_message)

        if missing:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " + ",".join(missing))
Beispiel #14
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        Every gene in a segment's comma-separated 'genes' annotation becomes one
        output row.  A gene equal to the segment's start_gene or end_gene is keyed
        as "<gene> <start_exon>" / "<gene> <end_exon>"; when the same key occurs in
        several segments the last segment wins.  Rows are written in sorted key
        order to self._filename as a tab-separated file whose columns are the
        options of the "alternatives" config section.

        :param segments: iterable of MutationData
        :param metadata: accepted for interface compatibility; not used here
        :param comments: iterable of strings, each written as a "## " line before
            the header; None means no comments
        """
        logger = logging.getLogger(__name__)
        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logger.info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            config_parser)

        if comments is None:
            comments = []

        # The context manager closes the output file even if rendering fails.
        with open(self._filename, 'w') as fp:
            for c in comments:
                fp.write("## " + c + "\n")

            # TODO: Define constant for "genes", and other annotations
            headers = config_parser.options("alternatives")
            gene_to_segment_dict = dict()

            # The field mapping is built once, from the first segment.  It stays
            # empty when there are no segments (and then no rows are written).
            field_mapping = {}
            is_segment_seen = False
            for seg in segments:
                if not is_segment_seen:
                    is_segment_seen = True
                    field_mapping = FieldMapCreator.create_field_map(
                        headers,
                        seg,
                        self._alternativeDictionary,
                        is_render_internal_fields=True,
                        prepend="")

                for g in seg['genes'].split(","):
                    if g == seg["start_gene"]:
                        gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                    elif g == seg["end_gene"]:
                        gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                    else:
                        gene_to_segment_dict[g] = seg

            # An explicit flag is needed here: an enumerate index of 0 cannot
            # distinguish "no segments" from "exactly one segment".
            if not is_segment_seen:
                logger.info(
                    "No segments given.  There will be no genes in the list.")

            writer = csv.DictWriter(fp,
                                    headers,
                                    delimiter="\t",
                                    lineterminator="\n",
                                    extrasaction="ignore")
            writer.writeheader()

            logger.info("Rendering gene list...")
            all_genes_seen = sorted(gene_to_segment_dict.keys())
            num_genes = len(all_genes_seen)
            for i, gene in enumerate(all_genes_seen):
                seg = gene_to_segment_dict[gene]
                # This next line may be slow...
                line_dict = dict(
                    (h, seg.get(field_mapping.get(h, h), "")) for h in headers)
                # The "gene" column always carries the (possibly exon-suffixed) key.
                line_dict["gene"] = gene
                writer.writerow(line_dict)
                if i % 1000 == 0:
                    logger.info("Rendered %d/%d genes ...", i + 1, num_genes)
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        Every gene in a segment's comma-separated 'genes' annotation becomes one
        output row.  A gene equal to the segment's start_gene or end_gene is keyed
        as "<gene> <start_exon>" / "<gene> <end_exon>"; when the same key occurs in
        several segments the last segment wins.  Rows are written in sorted key
        order to self._filename as a tab-separated file whose columns are the
        options of the "alternatives" config section.

        :param segments: iterable of MutationData
        :param metadata: accepted for interface compatibility; not used here
        :param comments: iterable of strings, each written as a "## " line before
            the header; None means no comments
        """
        logger = logging.getLogger(__name__)
        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logger.info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

        if comments is None:
            comments = []

        # The context manager closes the output file even if rendering fails.
        with open(self._filename, 'w') as fp:
            for c in comments:
                fp.write("## " + c + "\n")

            # TODO: Define constant for "genes", and other annotations
            headers = config_parser.options("alternatives")
            gene_to_segment_dict = dict()

            # The field mapping is built once, from the annotation names of the
            # first segment.  It stays empty when there are no segments (and
            # then no rows are written).
            field_mapping = {}
            annotations = None
            for seg in segments:
                if annotations is None:
                    annotations = seg.keys()
                    field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary, isRenderInternalFields=True, prepend="")

                for g in seg['genes'].split(","):
                    if g == seg["start_gene"]:
                        gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                    elif g == seg["end_gene"]:
                        gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                    else:
                        gene_to_segment_dict[g] = seg

            # annotations is still None only when the loop body never ran; an
            # enumerate index of 0 cannot distinguish "no segments" from
            # "exactly one segment".
            if annotations is None:
                logger.info("No segments given.  There will be no genes in the list.")

            writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
            writer.writeheader()

            logger.info("Rendering gene list...")
            all_genes_seen = sorted(gene_to_segment_dict.keys())
            num_genes = len(all_genes_seen)
            for i, gene in enumerate(all_genes_seen):
                seg = gene_to_segment_dict[gene]
                # This next line may be slow...
                line_dict = dict((h, seg.get(field_mapping.get(h, h), "")) for h in headers)
                # The "gene" column always carries the (possibly exon-suffixed) key.
                line_dict["gene"] = gene
                writer.writerow(line_dict)
                if i % 1000 == 0:
                    logger.info("Rendered %d/%d genes ...", i + 1, num_genes)