def _process_input_files(self):
    """Parse the interactions and annotations files and reconcile them.

    Opens ``self.args[0]`` as the interactions file and ``self.args[1]``
    as the annotations file (both in binary mode). On completion the
    following attributes are set:

    - ``self.interactions_graph``: result of
      ``parsers.parse_interactions_file_to_graph``, pruned of
      unannotated genes.
    - ``self.annotations_dict``: result of ``parsers.parse_gmt_to_dict``
      when the annotations file name ends with ``.gmt``, otherwise of
      ``parsers.parse_annotations_to_dict``; pruned of genes that are
      not in the graph.
    - ``self.annotations_stats``: result of
      ``structures.get_annotations_stats`` on the pruned annotations.

    Raises:
        AssertionError: if, after pruning, the number of genes in the
            graph differs from ``annotations_stats["num_genes"]``, or if
            any annotation term is left with no genes.
    """
    # The ``with`` statement guarantees both files are closed even when
    # parsing, pruning, or a sanity check raises, and also when opening
    # the second file fails after the first was opened.
    with open(self.args[0], "rb") as interactions_file, \
            open(self.args[1], "rb") as annotations_file:
        # Create interaction graph
        logger.info("Parsing interactions from {0}.".format(interactions_file.name))
        self.interactions_graph = parsers.parse_interactions_file_to_graph(interactions_file)
        logger.info(
            "{0} genes (products) with {1} interactions "
            "parsed.".format(
                len(self.interactions_graph),
                self.interactions_graph.number_of_edges(),
            )
        )

        # Create dictionary of annotations to genes, but only for genes in
        # the interaction graph
        logger.info("Parsing annotations from {0}.".format(annotations_file.name))
        if annotations_file.name.endswith(".gmt"):
            self.annotations_dict = parsers.parse_gmt_to_dict(annotations_file)
        else:
            self.annotations_dict = parsers.parse_annotations_to_dict(annotations_file)

        self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
        logger.info(
            "{num_total_annotations} annotations processed, "
            "for {num_genes} genes (or gene products), by "
            "{num_annotation_terms} different terms.".format(**self.annotations_stats)
        )

        # Remove from the graph the set of nodes that have no annotation.
        logger.info("Pruning unannotated genes (products) from " "interaction graph.")
        self.interactions_graph.prune_unannotated_genes(self.annotations_dict)
        logger.info(
            "{0} genes (products) with {1} interactions "
            "remaining in graph.".format(
                len(self.interactions_graph),
                self.interactions_graph.number_of_edges(),
            )
        )

        # Remove from the annotations any genes which are not in the graph.
        logger.info("Removing genes with no interactions from the " "sets of annotated genes.")
        self.interactions_graph.prune_non_network_genes_from_annotations(self.annotations_dict)
        self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
        logger.info(
            "{num_total_annotations} annotations, "
            "for {num_genes} genes (or gene products), by "
            "{num_annotation_terms} different terms "
            "remain.".format(**self.annotations_stats)
        )

        # Sanity test: the number of genes (products) in the
        # interactions_graph should equal the union of all the sets in
        # annotations_dict
        assert len(self.interactions_graph) == self.annotations_stats["num_genes"], (
            "interactions_graph and annotations_dict have unequal "
            "numbers of genes!"
        )
        # ``items()`` is available on both Python 2 and Python 3 mappings,
        # unlike ``iteritems()`` which exists only on Python 2.
        for term, genes in self.annotations_dict.items():
            assert len(genes) > 0, "%s has no genes!" % term
def _process_input_files(self):
    """Process the base input files, then apply expression values.

    Calls the parent implementation first, then opens ``self.args[2]``
    (binary mode) as the expression file, parses it with
    ``parsers.parse_expression_file`` and applies the result to
    ``self.interactions_graph``. Afterwards ``self.annotations_dict`` is
    re-pruned against the graph and ``self.annotations_stats`` is
    recomputed and logged together with the graph's edge count.
    """
    super(ContextualCli, self)._process_input_files()

    # The ``with`` statement closes the expression file even if parsing
    # or applying the values raises. The file stays open while the
    # values are applied in case the parser yields them lazily.
    with open(self.args[2], "rb") as expression_file:
        # Get the expression values.
        logger.info("Parsing expression values from %s." % expression_file.name)
        expression_values = parsers.parse_expression_file(expression_file)

        # Apply the expression values to the interaction graph, removing
        # any nodes lacking expression values from the graph.
        logger.info(
            "Removing genes without expression values from "
            "interaction graph and annotation sets."
        )
        self.interactions_graph.apply_expression_values_to_interactions_graph(expression_values)

    # Re-synchronize the interaction graph and annotations dictionary.
    self.interactions_graph.prune_non_network_genes_from_annotations(self.annotations_dict)

    self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
    # Merge the edge count with the annotation statistics so a single
    # mapping feeds the named placeholders in the log message.
    gene_stats = {"num_interactions": self.interactions_graph.number_of_edges()}
    gene_stats.update(self.annotations_stats)
    logger.info(
        "%(num_genes)d genes (products) with "
        "%(num_interactions)d interactions remaining in "
        "graph, with %(num_total_annotations)d annotations by "
        "%(num_annotation_terms)d terms." % gene_stats
    )