def test_reannotating_with_prepends(self):
    """Test that we will disregard the prepend when looking for fields to write"""
    m = MutationDataFactory.default_create()
    m.createAnnotation('i_foo', "blah", "INPUT")
    m.createAnnotation('foo', "bloop", "some datasource")
    headers = ['i_foo']
    alt_dict = {'i_foo': ['i_i_foo', 'foo']}

    mapping = FieldMapCreator.create_field_map(headers, m, alt_dict, is_render_internal_fields=True,
                                               deprioritize_input_annotations=True)
    self.assertTrue(mapping['i_foo'] == 'foo')

    mapping = FieldMapCreator.create_field_map(headers, m, alt_dict, is_render_internal_fields=True,
                                               deprioritize_input_annotations=False)
    self.assertTrue(mapping['i_foo'] == 'i_foo')

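# A minimal usage sketch (not part of the test suite) showing how the mapping returned by
# FieldMapCreator.create_field_map is typically consumed when writing a row: each output
# column is translated to an annotation name, and the value is pulled from the mutation,
# as the renderers below do.  The import paths are assumptions; adjust to the project layout.
from oncotator.MutationDataFactory import MutationDataFactory
from oncotator.utils.FieldMapCreator import FieldMapCreator

mut = MutationDataFactory.default_create()
mut.createAnnotation('i_foo', "blah", "INPUT")
mut.createAnnotation('foo', "bloop", "some datasource")

columns = ['i_foo']
field_map = FieldMapCreator.create_field_map(columns, mut, {'i_foo': ['i_i_foo', 'foo']},
                                             is_render_internal_fields=True,
                                             deprioritize_input_annotations=True)

# Translate each output column to its backing annotation, falling back to the column name.
row = dict((c, mut.get(field_map.get(c, c), "")) for c in columns)
print(row)  # expected: {'i_foo': 'bloop'}
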
def renderMutations(self, mutations, metadata=None, comments=None):
    if comments is None:
        comments = []
    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Create a list of annotation names and make sure to catch the case where there are no variants specified.
    try:
        m = mutations.next()
    except StopIteration as si:
        m = None

    if m is not None:
        fp = self._createVcfHeaderFilePtr(comments, m)
    else:
        fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

    if m is not None:
        fieldsUsed = self.alternativeDictionary.keys()
        self.fieldMap = FieldMapCreator.create_field_map(fieldsUsed, m, self.alternativeDictionary, True)

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

        mutRow = self._createMutRow(m)
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1

        self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))

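# Sketch of the "peek the first mutation, then stream the rest" pattern used by the
# renderers in this section: the first element decides how to build the header (or whether
# to fall back to metadata), and the remaining elements are consumed from the same iterator.
# Illustrative only; the function name is not part of the renderer API.
def peek_first(iterable):
    """Return (first_item_or_None, iterator_positioned_after_it)."""
    iterator = iter(iterable)
    try:
        first = iterator.next()  # Python 2 iterator protocol, as used in renderMutations
    except StopIteration:
        first = None
    return first, iterator

first, rest = peek_first([])  # first is None when no mutations are supplied
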
def renderMutations(self, mutations, metadata=None, comments=None):
    """ Returns a file name pointing to the maf file that is generated. """
    if metadata is None:
        metadata = OrderedDict()

    if comments is None:
        comments = []

    self.logger.info("TCGA MAF output file: " + self._filename)
    self.logger.info("Render starting...")

    requiredColumns = self.config.get("general", "requiredColumns").split(',')
    optionalColumns = self.config.get("general", "optionalColumns").split(',')

    # Create the header list, making sure to preserve order.
    headers = requiredColumns
    headers.extend(optionalColumns)

    # Create a list of annotation names
    try:
        m = mutations.next()
        annotations = MutUtils.getAllAttributeNames(m)
    except StopIteration as si:
        # There are no mutations, so use the config file and metadata to determine what columns to output
        metadataAnnotations = metadata.keys()
        annotations = set(headers).union(metadataAnnotations)
        m = None

    # If we are splitting allelic_depth into two fields, add those to the headers.  Note that the mutations will
    #  be annotated properly later.
    if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
        depth_fields = [TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT]
        headers.extend(depth_fields)

    if m is not None:
        # Add columns for the new annotations created as part of collapsing cols
        additional_internal_columns = []
        if self._column_collapser is not None:
            additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(m, self._column_collapser_suffix)

        # Create a mapping between column name and annotation name
        field_map = FieldMapCreator.create_field_map(headers, m, self.alternativeDictionary,
                                                     self.config.getboolean("general", "displayAnnotations"),
                                                     exposed_fields=self.exposedColumns, prepend=self._prepend,
                                                     deprioritize_input_annotations=self._is_reannotating,
                                                     additional_columns=additional_internal_columns)
        field_map_keys = field_map.keys()
        internal_fields = sorted(list(set(field_map_keys).difference(headers)))
        headers.extend(internal_fields)

    # Initialize the output file and write a header.
    fp = file(self._filename, 'w')
    fp.write("#version " + self.getTcgaMafVersion() + "\n")
    for c in comments:
        fp.write("## " + c + "\n")

    # Initialize a csv DictWriter
    # Remove headers that start with "_"
    dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
    dw.writeheader()

    ctr = 0
    try:
        # Add the NCBI build
        if m is not None:
            self._add_output_annotations(m)
            self._writeMutationRow(dw, field_map, field_map_keys, m)
            ctr += 1

        for m in mutations:
            # Add the NCBI build
            self._add_output_annotations(m)
            self._writeMutationRow(dw, field_map, field_map_keys, m)

            # Update mutation count and log every 1000 mutations
            ctr += 1
            if (ctr % 1000) == 0:
                self.logger.info("Rendered " + str(ctr) + " mutations.")
    except Exception as e:
        import traceback
        self.logger.error(traceback.format_exc())
        self.logger.error("Error at mutation " + str(ctr) + " " + str([m.chr, m.start, m.end, m.ref_allele, m.alt_allele]) + ": ")
        self.logger.error("Incomplete: rendered %d mutations." % (ctr))
        fp.close()
        raise e

    fp.close()

    if self._is_entrez_id_message_logged:
        logging.getLogger(__name__).warn("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")

    self.logger.info("Rendered all " + str(ctr) + " mutations.")
    return self._filename

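# Sketch of the tab-delimited csv.DictWriter setup the MAF renderer relies on: the header
# row comes from the column list, and each mutation row is supplied as a dict keyed by
# those columns.  The file name and column subset below are illustrative only.
import csv

example_headers = ["Hugo_Symbol", "Chromosome", "Start_position"]
with open("example.maf.txt", "w") as out_fp:
    writer = csv.DictWriter(out_fp, example_headers, delimiter="\t", lineterminator="\n")
    writer.writeheader()
    writer.writerow({"Hugo_Symbol": "TP53", "Chromosome": "17", "Start_position": "7574003"})
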
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    :param segments: iterable of MutationData
    :param metadata:
    :param comments:
    """
    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logging.getLogger(__name__).info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

    if metadata is None:
        metadata = OrderedDict()

    if comments is None:
        comments = []

    fp = file(self._filename, 'w')
    for c in comments:
        fp.write("## " + c + "\n")

    # TODO: Define constant for "genes", and other annotations
    headers = config_parser.options("alternatives")
    gene_to_segment_dict = dict()

    annotations = None
    i = 0
    for i, seg in enumerate(segments):
        if annotations is None:
            annotations = seg.keys()
            field_mapping = FieldMapCreator.create_field_map(headers, seg, self._alternativeDictionary,
                                                             is_render_internal_fields=True, prepend="")

        gene_list = seg['genes'].split(",")
        for g in gene_list:
            if g == seg["start_gene"]:
                gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
            elif g == seg["end_gene"]:
                gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
            else:
                gene_to_segment_dict[g] = seg

    if i == 0:
        logging.getLogger(__name__).info("No segments given. There will be no genes in the list.")

    writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
    writer.writeheader()

    logging.getLogger(__name__).info("Rendering gene list...")
    all_genes_seen = sorted(gene_to_segment_dict.keys())
    num_genes = len(all_genes_seen)
    for i, gene in enumerate(all_genes_seen):
        # This next line may be slow...
        line_dict = dict()
        seg = gene_to_segment_dict[gene]
        for h in headers:
            annotation_field = field_mapping.get(h, h)
            line_dict[h] = seg.get(annotation_field, "")
        line_dict["gene"] = gene
        writer.writerow(line_dict)

        if i % 1000 == 0:
            logging.getLogger(__name__).info("Rendered %d/%d genes ..." % ((i + 1), num_genes))

    fp.close()

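# Sketch of the gene -> segment keying rule used above: every gene overlapping a segment
# maps to that segment, but the genes at the segment boundaries are keyed together with the
# exon at which the segment starts or ends.  The example segment dict below is illustrative;
# real input is a MutationData with these annotations.
def gene_keys_for_segment(seg):
    """Yield the gene-list keys that a single segment contributes."""
    for g in seg['genes'].split(","):
        if g == seg["start_gene"]:
            yield g + " " + seg["start_exon"]
        elif g == seg["end_gene"]:
            yield g + " " + seg["end_exon"]
        else:
            yield g

example_seg = {"genes": "GENE_A,GENE_B,GENE_C", "start_gene": "GENE_A", "start_exon": "2+",
               "end_gene": "GENE_C", "end_exon": "5-"}
print(list(gene_keys_for_segment(example_seg)))  # ['GENE_A 2+', 'GENE_B', 'GENE_C 5-']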