def parse_annotations_to_dict(annotations_fileh):
    """Build a term-to-genes mapping from a CSV annotations file.

    Each row must provide a "gene_id" column (the gene/gene product ID)
    and a "term" column (the name or ID of the term annotating that
    gene/product). Any other columns are ignored.

    Returns a `convstructs.TwoWaySetDict` instance keyed by annotation
    term, whose values are `set`s of the genes annotated by that term.

    :Parameters:
    - `annotations_fileh`: a CSV file with a header line as the first
      line

    """
    term_to_genes = convstructs.TwoWaySetDict()
    for row in convutils.make_csv_reader(annotations_fileh):
        gene_id = row["gene_id"]
        annotation = row["term"]
        if annotation not in term_to_genes:
            # First time this term is seen: start a new gene set.
            term_to_genes[annotation] = set([gene_id])
        else:
            term_to_genes.add_item(annotation, gene_id)
    return term_to_genes
def parse_interactions_file_to_graph(interactions_fileh):
    """Build an interaction graph from a CSV interactions file.

    Each row must provide the columns "interactor1" and "interactor2".
    When a row also carries a "weight" column, its value (converted to
    `float`) is used as the weight, or "confidence", of the
    interaction; otherwise the edge gets a weight of 1. Any other
    columns are ignored.

    Returns a `structures.EdgeSwapGraph` with genes/gene products as
    nodes and interactions as weighted edges.

    :Parameters:
    - `interactions_fileh`: a CSV file with a header line as the first
      line

    """
    graph = structures.EdgeSwapGraph()
    for row in convutils.make_csv_reader(interactions_fileh):
        source = row["interactor1"]
        target = row["interactor2"]
        # Rows without a "weight" field fall back to unit weight.
        edge_weight = float(row["weight"]) if "weight" in row else 1
        graph.add_edge(source, target, weight=edge_weight)
    return graph
def parse_selected_links_file(selected_links_fileh):
    """Parse a CSV pairs file to an iterator of links.

    The file should have no header and only two columns, where the
    annotation in the first column needs to be tested if it is "linked
    to" the annotation in the second column.

    NOTE: This is a generator; it will yield links (as 2-tuples) until
    the file is completely consumed.

    :Parameters:
    - `selected_links_fileh`: a CSV file of two columns and no headers
      with annotations in the columns

    :Raises:
    - `AssertionError`: if a row does not contain exactly two entries;
      the message reports the 1-based row number

    """
    csv_reader = convutils.make_csv_reader(selected_links_fileh, False)
    for i, link in enumerate(csv_reader):
        if len(link) != 2:
            # The line number must be parenthesized: `%` binds tighter
            # than `+`, so "..." % i + 1 would try to add an int to the
            # formatted string and raise TypeError instead of reporting
            # the malformed line. AssertionError is kept as the
            # exception type for backward compatibility, but raised
            # explicitly so the check is not stripped under `-O`.
            raise AssertionError(
                "Line %d has fewer or greater than "
                "two annotation entries." % (i + 1)
            )
        yield tuple(link)
def parse_expression_file(expression_fileh):
    """Read gene expression values from a CSV file.

    Returns a dictionary with gene (product) identifiers as keys and
    expression values (as `float`s) as values. If an identifier occurs
    on more than one row, the largest of its expression values is kept,
    and a single warning is logged the first time any duplicate is
    encountered.

    :Parameters:
    - `expression_fileh`: a CSV file of gene (or gene product)
      expression values. The file should have a column titled "id"
      which has the gene (or gene product) ID, and a column titled
      "expression" which gives a value for the expression level, or
      difference in expression levels.

    """
    expressions_dict = {}
    duplicate_warning_issued = False
    for row in convutils.make_csv_reader(expression_fileh):
        value = float(row["expression"])
        gene_id = row["id"]
        if gene_id not in expressions_dict:
            expressions_dict[gene_id] = value
            continue

        # This ID has been seen before. Duplicates are tolerated rather
        # than treated as errors; warn only once to avoid flooding the
        # log.
        if not duplicate_warning_issued:
            logger.warning(
                "WARNING! Multiple expression values "
                "detected for at least one gene; continuing "
                "anyway."
            )
            duplicate_warning_issued = True
        # Keep the larger of the stored and the new value.
        if value > expressions_dict[gene_id]:
            expressions_dict[gene_id] = value
    return expressions_dict
def parse_set_significances(significances_fileh):
    """Parses the set significances from a CSV file.

    Returns a dictionary with set names as keys. Each value is a list
    holding one `(sign, magnitude)` tuple per significance column,
    where `sign` is whatever `get_sign()` returns for the raw value and
    `magnitude` is `abs(float(value))`.

    An empty file (no header row at all) yields an empty dictionary.

    :Parameters:
    - `significances_fileh`: A CSV file containing gene set
      significances. The file should contain a header row, which will
      be ignored. The first column should contain the name of each
      gene set; each column following should correspond with that gene
      set's significance in each file, in order with respect to the
      input order on the command line.

    """
    csv_reader = convutils.make_csv_reader(significances_fileh, headers=False)
    # Discard the header row. The builtin next() works for both
    # Python 2 and Python 3 iterators (the `.next()` method exists only
    # in Python 2), and the default avoids a StopIteration escaping
    # when the file is empty.
    next(csv_reader, None)
    set_significances = {}
    for entry in csv_reader:
        set_significances[entry[0]] = [
            (get_sign(value), abs(float(value))) for value in entry[1:]
        ]
    return set_significances
def results_to_edges_and_stats(
    csv_fileh,
    significance_cutoff,
    annotation1_column_title,
    annotation2_column_title,
    significance_column_title,
    annotation1_size_column_title,
    annotation1_neighbors_size_column_title,
    annotation2_size_column_title,
    selected_annotations,
    less_than=True
):
    """Collect significant annotation pairs from a CSV results file.

    Returns a tuple `(edges, annotation_stats)`. `edges` is a
    dictionary keyed by `(annotation1, annotation2)` pairs whose values
    are the complete CSV entries for those pairs; if a pair occurs on
    several significant rows, the last one wins. `annotation_stats` is
    currently always an empty dictionary (see the TODO in the body).

    :Parameters:
    - `csv_fileh`: a CSV file with column headers
    - `significance_cutoff`: a value to use as the threshold for
      including an edge
    - `annotation1_column_title`: title of the column containing the
      name of the first annotation
    - `annotation2_column_title`: title of the column containing the
      name of the second annotation
    - `significance_column_title`: title of the column containing
      significance values (e.g., `'pvalue'`)
    - `annotation1_size_column_title`: title of the column containing
      the size of the set of genes annotated by the first annotation
      (currently unused)
    - `annotation1_neighbors_size_column_title`: title of the column
      containing the size of the set of genes neighboring those
      annotated by the first annotation (currently unused)
    - `annotation2_size_column_title`: title of the column containing
      the size of the set of genes annotated by the second annotation
      (currently unused)
    - `selected_annotations`: a `set` of annotations to which
      interactions should be restricted, or `None` for no restriction
    - `less_than`: whether the significance should be less than or
      equal to the `significance_cutoff` to be considered significant,
      or be greater than or equal to the `significance_cutoff`
      [default: `True`]

    """
    edges = {}
    # TODO: Filling annotation_stats (per-annotation 'size' and
    # 'neighbors' taken from the size columns) and stripping the
    # annotation/size columns from each stored entry were disabled for
    # the mcmcbpn hack; fix. Until then this dictionary stays empty.
    annotation_stats = {}

    for row in convutils.make_csv_reader(csv_fileh):
        score = float(row[significance_column_title])
        if less_than:
            passes_cutoff = score <= significance_cutoff
        else:
            passes_cutoff = score >= significance_cutoff
        if not passes_cutoff:
            continue

        first_term = row[annotation1_column_title]
        second_term = row[annotation2_column_title]
        # When restricting to selected annotations, keep the row only
        # if both of its terms are in the selected set.
        if selected_annotations is not None and not (
            first_term in selected_annotations
            and second_term in selected_annotations
        ):
            continue

        edges[(first_term, second_term)] = row

    return edges, annotation_stats