Exemple #1
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write each renderable mutation as one tab-separated row.

        The first mutation is pulled off the iterator up front because it is
        needed twice before any row can be written: it is handed to
        ``_createVcfHeaderFilePtr`` to obtain the output file pointer, and to
        ``MutUtils`` to build ``self.fieldMap``.  It is then pushed back in
        front of the remaining mutations so that every mutation goes through
        exactly the same render/count path.

        :param mutations: iterator of mutations; may be empty.
        :param metadata: object exposing ``asDict()``.  Only consulted when
            there are no mutations; ``None`` is treated as empty metadata.
        :param comments: comments passed through to
            ``_createVcfHeaderFilePtr``; defaults to an empty list.
        """
        # Function-scope import so this method stays self-contained.
        import itertools

        if comments is None:
            comments = []

        outputHeaders = [
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NORMAL', 'PRIMARY'
        ]

        # Peek at the first mutation; None means no variants were specified.
        # The next() builtin works on Python 2.6+ and Python 3, unlike the
        # Python 2-only ``mutations.next()``.
        m = next(mutations, None)

        if m is not None:
            fp = self._createVcfHeaderFilePtr(comments, m)

            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(m)
            self.fieldMap = MutUtils.createFieldsMapping(
                fieldsUsed, annotations, self.alternativeDictionary, True)

            # Put the peeked mutation back in front of the rest.
            toRender = itertools.chain([m], mutations)
        else:
            # No mutation to derive the header from: fall back to the
            # metadata, tolerating the default metadata=None.
            headerSource = metadata.asDict() if metadata is not None else {}
            fp = self._createVcfHeaderFilePtr(comments, headerSource)
            toRender = mutations

        # NOTE(review): fp is not closed in this method; presumably its
        # creator owns it -- confirm.
        tsvWriter = csv.DictWriter(fp,
                                   outputHeaders,
                                   delimiter="\t",
                                   lineterminator="\n")

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        for m in toRender:
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1

            ctr += 1
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")

        self.logger.info("Processed all " + str(ctr) +
                         " mutations.  Could not render: " +
                         str(unrenderableRows))
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Render the given mutations as tab-separated rows.

        The output file pointer comes from ``_createVcfHeaderFilePtr``, which
        receives the first mutation, or the metadata dictionary when there
        are no mutations at all.  ``self.fieldMap`` is rebuilt from the first
        mutation's attribute names whenever at least one mutation exists.

        :param mutations: iterator of mutations; may be empty.
        :param metadata: object exposing ``asDict()``; used only for empty
            input.  ``None`` is treated as empty metadata.
        :param comments: comments forwarded to ``_createVcfHeaderFilePtr``;
            defaults to an empty list.
        """
        # Function-scope import so this method stays self-contained.
        import itertools

        if comments is None:
            comments = []

        outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

        # Catch the case where there are no variants specified.  next() with
        # a default replaces the Python 2-only ``mutations.next()`` and its
        # StopIteration handler.
        m = next(mutations, None)

        if m is None:
            # Tolerate the default metadata=None instead of failing on
            # ``None.asDict()``.
            headerSource = {} if metadata is None else metadata.asDict()
            fp = self._createVcfHeaderFilePtr(comments, headerSource)
            pending = mutations
        else:
            fp = self._createVcfHeaderFilePtr(comments, m)

            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(m)
            self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)

            # Re-attach the mutation consumed above so it is rendered and
            # counted exactly like the others.
            pending = itertools.chain([m], mutations)

        # NOTE(review): fp is never closed here; presumably the object that
        # created it is responsible -- confirm.
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        for m in pending:
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is None:
                unrenderableRows += 1
            else:
                tsvWriter.writerow(mutRow)

            ctr += 1
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")

        self.logger.info("Processed all " + str(ctr) + " mutations.  Could not render: " + str(unrenderableRows))
Exemple #3
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        Every gene named in a segment's comma-separated ``genes`` annotation
        becomes one output row.  The row key is the gene name, suffixed with
        ``" " + start_exon`` (or ``end_exon``) when the gene equals the
        segment's ``start_gene`` (or ``end_gene``).  When two segments yield
        the same key, the later segment replaces the earlier one.  Rows are
        written in sorted key order.

        :param segments: iterable of MutationData
        :param metadata: currently unused by this renderer
        :param comments: list of strings, each written as a ``## `` line
            ahead of the header row; defaults to an empty list
        """
        logger = logging.getLogger(__name__)
        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logger.info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

        if comments is None:
            comments = []

        # The with-block guarantees the file is closed even if rendering
        # fails part-way through.
        with open(self._filename, 'w') as fp:
            for c in comments:
                fp.write("## " + c + "\n")

            # TODO: Define constant for "genes", and other annotations
            headers = config_parser.options("alternatives")
            gene_to_segment_dict = dict()
            annotations = None
            field_mapping = {}
            for seg in segments:
                if annotations is None:
                    # The field mapping is derived from the first segment only.
                    annotations = seg.keys()
                    field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary, isRenderInternalFields=True, prepend="")

                for g in seg['genes'].split(","):
                    if g == seg["start_gene"]:
                        gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                    elif g == seg["end_gene"]:
                        gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                    else:
                        gene_to_segment_dict[g] = seg

            # annotations is still None only if the loop body never ran.  An
            # enumerate index cannot tell zero segments from exactly one.
            if annotations is None:
                logger.info("No segments given.  There will be no genes in the list.")

            writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
            writer.writeheader()

            logger.info("Rendering gene list...")
            all_genes_seen = sorted(gene_to_segment_dict.keys())
            num_genes = len(all_genes_seen)
            for i, gene in enumerate(all_genes_seen):
                seg = gene_to_segment_dict[gene]
                line_dict = dict()
                for h in headers:
                    # Fall back to the header name when it has no mapping.
                    annotation_field = field_mapping.get(h, h)
                    line_dict[h] = seg.get(annotation_field, "")
                # extrasaction="ignore" drops this entry when "gene" is not
                # one of the configured headers.
                line_dict["gene"] = gene
                writer.writerow(line_dict)
                if i % 1000 == 0:
                    logger.info("Rendered %d/%d genes ..." % ((i+1), num_genes))
Exemple #4
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write the mutations to a TCGA MAF file and return its file name.

        The column list is the configured required columns followed by the
        optional columns, then any further field-map keys in sorted order.
        When there are no mutations, the available annotations are taken from
        the configured columns plus the metadata keys, so that a header-only
        file is still produced.

        :param mutations: iterator of mutations; may be empty.
        :param metadata: mapping whose keys are used as annotation names when
            there are no mutations; defaults to an empty OrderedDict.
        :param comments: list of strings, each written as a ``## `` line after
            the ``#version`` line; defaults to an empty list.
        :returns: ``self._filename``
        :raises Exception: any error raised while rows are being written is
            logged and re-raised unchanged.
        """
        # Function-scope imports so this method stays self-contained.
        import itertools
        import traceback

        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general", "requiredColumns").split(',')
        optionalColumns = self.config.get("general", "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns + optionalColumns

        # Create a list of annotation names.  The next() builtin works on
        # Python 2.6+ and Python 3, unlike ``mutations.next()``.
        try:
            m = next(mutations)
        except StopIteration:
            # There are no mutations, so use the config file and metadata to
            # determine what columns to output.
            m = None
            annotations = set(headers).union(metadata.keys())
        else:
            annotations = MutUtils.getAllAttributeNames(m)

        # Create a mapping between column name and annotation name
        fieldMap = MutUtils.createFieldsMapping(
            headers,
            annotations,
            self.alternativeDictionary,
            self.config.getboolean("general", "displayAnnotations"),
            exposedFields=self.exposedColumns,
            prepend=self._prepend)
        fieldMapKeys = fieldMap.keys()
        internalFields = sorted(set(fieldMapKeys).difference(headers))
        headers.extend(internalFields)

        # Put the mutation consumed above back in front of the rest so every
        # mutation is rendered by the same loop.
        firstBatch = [] if m is None else [m]

        ctr = 0
        # The with-block closes the file on success and on every error path,
        # including failures while writing the header lines.
        with open(self._filename, 'w') as fp:
            fp.write("#version " + self.getTcgaMafVersion() + "\n")

            for c in comments:
                fp.write("## " + c + "\n")

            # Initialize a csv DictWriter.
            # NOTE(review): an earlier comment here said headers starting
            # with "_" are removed, but no such filtering is performed.
            dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
            dw.writeheader()

            try:
                for m in itertools.chain(firstBatch, mutations):

                    # Add the NCBI build
                    m.createAnnotation('ncbi_build',
                                       self.lookupNCBI_Build(m.build),
                                       annotationSource="OUTPUT")
                    self._writeMutationRow(dw, fieldMap, fieldMapKeys, m)

                    # Update mutation count and log every 1000 mutations
                    ctr += 1
                    if (ctr % 1000) == 0:
                        self.logger.info("Rendered " + str(ctr) + " mutations.")
            except Exception:
                self.logger.error(traceback.format_exc())
                # m is still None if the failure happened before any mutation
                # was obtained.  Guard it so the handler cannot raise a
                # second error that hides the real one.
                if m is None:
                    location = "<no mutation>"
                else:
                    location = str([m.chr, m.start, m.end, m.ref_allele, m.alt_allele])
                self.logger.error("Error at mutation " + str(ctr) + " " + location + ": ")
                self.logger.error("Incomplete: rendered %d mutations." % (ctr))
                # A bare raise keeps the original traceback intact.
                raise

        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warning(
                "Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF."
            )
        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename