def __init__(self): databases = Databases() self.reaction_to_ko = databases.r2k() self.compound_to_reaction = databases.c2r() self.compounds = databases.c() self.positive = 'positive' self.negative = 'negative' self.abundace = "frequency_matrix.tsv" self.enrichment = "enrichment_results.tsv" self.abundace_header = ["Compound"] self.enrichment_header = ["Compound", "Group_1", "Group_2", "group_1_mean", "group_2_mean", "score", "pvalue", "description"]
def __init__(self, metadata, abundances_metagenome, abundances_transcriptome, abundances_metabolome, fisher_results): self.abundances_metagenome = abundances_metagenome self.abundances_transcriptome = abundances_transcriptome self.abundances_metabolome = abundances_metabolome self.fisher_results = fisher_results self.metadata_keys = list(metadata.keys()) databases = Databases() self.reaction_to_module = databases.r2m() self.module_to_reaction = databases.m2r() self.module_descriptions = databases.m() self.reaction_to_pathway = databases.r2p() self.pathway_to_reaction = databases.p2r() self.pathway_descriptions = databases.p() self.compound_desc_dict = databases.compound_desc_dict() self.compound_descriptions = databases.c() self.reaction_descriptions = databases.r() self.reactions_to_compounds = databases.r2c() self.reactions_to_kos = databases.r2k() self.matrix_header = ["compound", "reaction", 'type'] self.transcriptome_header = [ key + '_reaction_transcriptome' for key in self.metadata_keys ] self.compound_header = [ key + '_compound' for key in self.metadata_keys ] self.metadata_header = [ 'node', 'description', 'type', 'module', 'module_descr', 'pathway', 'pathway_descr', 'node_type' ] self.query_header = ['query', 'step'] self.step_header = ['step'] self.to_omit = set([ "C00828", # Menaquinone "C00534", # Pyridoxamine "C00006", # NADP+ "C00003", # NAD+ "C00002", # ATP "C00314", # Pyridoxine "C00864", # Pantothenate "C00504", # Folate "C00032", # Heme "C05443", # Vitamin D3 "C00253", # Nicotinate "C00250", # Pyridoxal "C11378", # Ubiquinone-10 "C05777", # Coenzyme F430 "C00072", # Ascorbate "C00378", # Thiamine "C00101", # Tetrahydrofolate "C00029", # UDP-glucose "C00068", # Thiamin diphosphate "C00061", # FMN "C00063", # CTP "C05776", # Vitamin B12 "C00113", # PQQ "C18237", # Molybdoenzyme molybdenum cofactor "C00051", # Glutathione "C00010", # CoA "C00016", # FAD "C00018", # Pyridoxal phosphate "C00019", # S-Adenosyl-L-methionine "C00153", # Nicotinamide "C04628", # Coenzyme B "C00862", # Methanofuran "C15672", # Heme O "C15670", # Heme A "C02059", # Phylloquinone "C03576", # Coenzyme M "C05441", # Vitamin D2 "C00272", # Tetrahydrobiopterin "C02477", # alpha-Tocopherol "C00473", # Retinol "C00120", # Biotin "C00725", # Lipoate "C00053", # 3'-Phosphoadenylyl sulfate "C00194", # Cobamide coenzyme "C00255", # Riboflavin 'C00001', # H2O 'C00008', # ADP 'C00013', # Diphosphate 'C00004', # NADH 'C00005', # NADPH 'C00080', # H+ 'C00009', # Orthophosphate 'C00008', # ADP 'C00004', # NADH 'C00020', # AMP 'C00007', # Oxygen 'C00015' ]) # UDP
class NetworkAnalyser: """ Prepare metagenome, metatranscriptome, metabolomic data for constructing SIF network files. """ MATRIX = 'matrix' NETWORK = 'network' EXPLORE = 'explore' DEGRADE = 'degrade' PATHWAY = 'pathway' ANNOTATE = 'annotate' ENRICHMENT = 'enrichment' MODULE_AB = 'module_ab' TRAVERSE = 'traverse' NETWORK_OUTPUT_FILE = 'network.tsv' METADATA_OUTPUT_FILE = 'metadata.tsv' TRAVERSE_OUTPUT_FILE = 'traverse.tsv' def __init__(self): self.databases = Databases() self.reactions = self.databases.r() self.reaction_to_ko = self.databases.r2k() def average(self, input_dictionary): ''' Take the average of the values of a dictionary of dictionaries ''' for sample_group, group_dict in input_dictionary.items(): for group, reaction_dict in group_dict.items(): for reaction, value in reaction_dict.items(): input_dictionary[sample_group][group][reaction] = sum(value) / len(value) return input_dictionary def median_genome_abundance(self, sample_abundance_dict, sample_metadata): """ Create a dictionary with the median abundance in sample_abundance_dict using sample_metadata as a reference. """ median_sample_abundance = dict() for group, samples in sample_metadata.items(): median_sample_abundance[group] = dict() sample_dictionaries = [sample_abundance_dict[sample] for sample in samples] genomes = set(list(itertools.chain(*[list(sample_dictionary.keys()) for sample_dictionary in sample_dictionaries]))) for genome in genomes: abundances = [sample_dictionary[genome] for sample_dictionary in sample_dictionaries] median_sample_abundance[group][genome] = statistics.median(abundances) return median_sample_abundance def normalise_by_abundance(self, median_sample_abundances, reaction_abundance_dict, group_to_genome, genome_to_group, genome_groups): normalised_abundance_dict = dict() for sample_group in list(median_sample_abundances.keys()): normalised_abundance_dict[sample_group] = dict() for genome_group in genome_groups: normalised_abundance_dict[sample_group][genome_group] = dict() for sample_group, genome_abundances in median_sample_abundances.items(): for genome, genome_abundance in genome_abundances.items(): if(genome in genome_to_group and genome in reaction_abundance_dict): for reaction in list(reaction_abundance_dict[genome].keys()): normalised_value = reaction_abundance_dict[genome][reaction]*genome_abundance genome_group = next(iter(genome_to_group[genome])) if reaction in normalised_abundance_dict[sample_group][genome_group]: normalised_abundance_dict[sample_group][genome_group][reaction].append( normalised_value ) else: normalised_abundance_dict[sample_group][genome_group][reaction] = [normalised_value] return normalised_abundance_dict def average_tpm_by_sample(self, tpm_results, sample_metadata): output_dict = dict() tpm_dict, annotations, genomes = tpm_results for group, samples in sample_metadata.items(): output_dict[group] = dict() for sample in samples: for annotation in annotations: if str.encode(sample) in tpm_dict: for genome in genomes: if genome not in output_dict[group]: output_dict[group][genome] = dict() if annotation not in output_dict[group][genome]: output_dict[group][genome][annotation] = list() if genome in tpm_dict[str.encode(sample)]: if annotation in tpm_dict[str.encode(sample)][genome]: output_dict[group][genome][annotation].append(tpm_dict[str.encode(sample)][genome][annotation]) else: output_dict[group][genome][annotation].append(0.0) else: output_dict[group][genome][annotation].append(0.0) for genome, values in output_dict[group].items(): for annotation in values: output_dict[group][genome][annotation] = sum(output_dict[group][genome][annotation])/len(output_dict[group][genome][annotation]) return output_dict def average_tpm_values(self, transriptome_abundance_dict, group_metadata): output_dict = dict() reactions = list(self.reactions.keys()) for genome_group_name, group_reaction_abundance_dict in transriptome_abundance_dict.items(): output_dict[genome_group_name] = dict() for group, members in group_metadata.items(): output_dict[genome_group_name][group] = dict() for reaction in reactions: to_average = list() for member in members: if member in group_reaction_abundance_dict: if str.encode(reaction) in group_reaction_abundance_dict[member]: to_average.append(group_reaction_abundance_dict[member][str.encode(reaction)]) else: to_average.append(0.0) else: to_average.append(0.0) average_value = sum(to_average) / len(to_average) output_dict[genome_group_name][group][reaction] = average_value return output_dict def aggregate_dictionary(self, reference_dict, matrix_dict): output_dict_mean = dict() for sample, ko_abundances in matrix_dict.items(): output_dict_mean[sample] = dict() for reaction, ko_list in reference_dict.items(): abundances = list() for ko in ko_list: if ko in ko_abundances: if ko_abundances[ko]>0: abundances.append(ko_abundances[ko]) else: logging.debug("ID not found in input matrix: %s" % ko) if any(abundances): abundance_mean = sum(abundances)/len(abundances) # average of the abundances... else: abundance_mean = 0 output_dict_mean[sample][reaction] = abundance_mean return output_dict_mean def mock_metadata(self, genomes): genome_to_group = {genome:set([genome]) for genome in genomes} genome_groups = set(genomes) group_to_genome = dict(genome_to_group) # Make a copy here return genome_to_group, genome_groups, group_to_genome def network_pipeline(self, subparser_name, matrix, genome_metadata_path, transcriptome_abundances_path, transcriptome_metadata_path, metagenome_abundances, metagenome_metadata_path, metabolome, enrichment_output, depth, filter, limit, queries, output_directory): ''' Parameters ---------- matrix transcriptome_abundances_path metagenome_abundances metagenome_metadata_path metabolome enrichment_output depth filter limit queries output_directory ''' orthology_matrix, genome_names, _ = Parser.parse_simple_matrix(matrix) if genome_metadata_path: genome_to_group, genome_groups, group_to_genome = \ Parser.parse_metadata_matrix(genome_metadata_path) else: genome_to_group, genome_groups, group_to_genome = \ self.mock_metadata(genome_names) reaction_matrix = self.aggregate_dictionary(self.reaction_to_ko, orthology_matrix) # Read in fisher results if enrichment_output: logging.info('Parsing input enrichment results') fisher_results = Parser.parse_enrichment_output(enrichment_output) else: logging.info('No enrichment results provided') fisher_results = None # Read in metabolome abundances if metabolome: logging.info('Parsing metabolome abundances') abundances_metabolome = Parser.parse_simple_matrix(metabolome) else: logging.info('No metabolome abundances provided') abundances_metabolome = None # Read in genome metagenome_abundances if metagenome_abundances: logging.info('Parsing input genome abundances') sample_abundance = Parser.parse_simple_matrix(metagenome_abundances)[0] sample_metadata = Parser.parse_metadata_matrix(metagenome_metadata_path)[2] else: # FIXME : There's always a better way than faking it. logging.info('No genome abundances provided') sample_abundance = {'MOCK': {x:1 for x in list(reaction_matrix.keys())} } sample_metadata = {"abundance": ['MOCK']} median_sample_abundances = self.median_genome_abundance(sample_abundance, sample_metadata) normalised_abundance_dict = self.normalise_by_abundance(median_sample_abundances, reaction_matrix, group_to_genome, genome_to_group, genome_groups) abundances_metagenome = self.average(normalised_abundance_dict) # Read in expression (TPM) values if transcriptome_abundances_path: logging.info("Parsing detectM TPM abundances") transcriptome_metadata = Parser.parse_metadata_matrix(transcriptome_metadata_path)[2] transcriptome_abundance_dict = self.average_tpm_by_sample(Parser.parse_tpm_values(transcriptome_abundances_path), transcriptome_metadata) transcriptome_abundances = self.average_tpm_values(transcriptome_abundance_dict, group_to_genome) else: transcriptome_abundances = None network_builder = NetworkBuilder(group_to_genome, abundances_metagenome, transcriptome_abundances, abundances_metabolome, fisher_results) # Run the subcommand specified if subparser_name == self.EXPLORE: network_lines, node_metadata = network_builder.query_matrix(queries, depth) elif subparser_name == self.PATHWAY: network_lines, node_metadata = network_builder.pathway_matrix(limit, filter) # Write the outputs Writer.write(network_lines, os.path.join(output_directory, self.NETWORK_OUTPUT_FILE)) Writer.write(node_metadata, os.path.join(output_directory, self.METADATA_OUTPUT_FILE)) logging.info('Finished the %s pipeline' % subparser_name)