Esempio n. 1
0
 def uses_pipeline(self, compounds_list_path, annotation_matrix_path, metadata_path, output,
                   count):
     '''
     Tally which annotations use a list of compounds, then test those
     tallies for enrichment between metadata groups. Both results are
     written into `output`.

     Parameters
     ----------
     compounds_list_path     - path handed to Parser.parse_single_column_text_file
     annotation_matrix_path  - path handed to Parser.parse_simple_matrix
     metadata_path           - path handed to Parser.parse_metadata_matrix
     output                  - directory the two result files are written into
     count                   - forwarded unchanged to self.uses
     '''
     logging.info('Parsing input compounds list')
     compounds = Parser.parse_single_column_text_file(compounds_list_path)

     logging.info('Parsing input annotations')
     # Only the first two parsed values are needed here.
     annotation_lookup, columns, _ = Parser.parse_simple_matrix(annotation_matrix_path)

     logging.info('Parsing input metadata')
     # Only the third parsed value (the attribute dictionary) is needed here.
     _, _, groups = Parser.parse_metadata_matrix(metadata_path)

     logging.info('Tallying genes that use specified compounds')
     abundance_lines, tallies = self.uses(compounds, annotation_lookup, columns, count)

     logging.info('Writing file: %s' % self.abundace)
     abundance_path = os.path.join(output, self.abundace)
     Writer.write(abundance_lines, abundance_path)

     logging.info('Calculating enrichment between groups for each compound')
     enrichment_lines = self.enrichment(tallies, groups)

     # NOTE(review): self.enrichment is called as a method just above but is
     # used as a file name below - confirm which attribute holds the
     # enrichment output file name.
     logging.info('Writing file: %s' % self.enrichment)
     enrichment_path = os.path.join(output, self.enrichment)
     Writer.write(enrichment_lines, enrichment_path)

     logging.info('Finished the use pipeline')
Esempio n. 2
0
    def predict_pipeline(self, forester_model_directory, input_matrix_path,
                         output_directory):
        '''
        Apply a previously generated model to the samples of an input
        matrix and write the predictions to a file.

        Inputs
        ------
        forester_model_directory    - handed to ParseGenerate, which exposes
                                      the model, its attributes and labels
        input_matrix_path           - path handed to Parser.parse_simple_matrix
        output_directory            - directory the predictions file
                                      (self.predictions_output_file) is
                                      written into

        Outputs
        -------
        None. The lines returned by self.make_predictions are written
        with Writer.write.
        '''
        forester_model = ParseGenerate(forester_model_directory)

        logging.info('Parsing input')
        logging.info('Loading model: %s' % (forester_model.rf_model))

        logging.info('Parsing data')
        features, _, _ = Parser.parse_simple_matrix(input_matrix_path)

        sample_list = []
        content_list = []

        for sample, content in features.items():
            sample_list.append(sample)
            # One value per model attribute, in model order; attributes the
            # sample does not have are filled with '0'.
            content_list.append([
                content[attribute] if attribute in content else '0'
                for attribute in forester_model.attributes
            ])

        logging.info('Making predictions')
        output_lines = self.make_predictions(forester_model.model,
                                             sample_list, content_list,
                                             forester_model.labels)

        predictions_path = os.path.join(output_directory,
                                        self.predictions_output_file)
        Writer.write(output_lines, predictions_path)
Esempio n. 3
0
    def generate_pipeline(self, input_matrix_path, groups_path, model_type,
                          testing_portion, grid_search, threads,
                          output_directory):
        '''
        Train a random forest on an input matrix, evaluate it on a held
        out portion of the data and store the model and its summaries.

        Inputs
        ------
        input_matrix_path   - path handed to Parser.parse_simple_matrix
        groups_path         - path handed to Parser.parse_metadata_matrix
        model_type          - must equal self.regressor or self.classifier
        testing_portion     - float. Fraction of the data used for testing
                              (logged as a percentage, forwarded to
                              self.tune)
        grid_search         - forwarded to self.tune
        threads             - forwarded to self.tune
        output_directory    - directory all output files are written into

        Outputs
        -------
        None. Written into output_directory: the attribute importances
        (self.attribute_importances), the best parameters plus accuracy
        (self.model_accuracy), the pickled model (self.model_pickle) and
        the pickled label dictionary (self.labels_dict).

        Raises
        ------
        ValueError          - if model_type is neither self.regressor nor
                              self.classifier
        '''
        logging.info('Using %f%% of the input data for testing',
                     testing_portion * 100)

        if model_type == self.regressor:
            model = RandomForestRegressor()
        elif model_type == self.classifier:
            model = RandomForestClassifier()
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError("Model type not recognised: %s" % (model_type))

        logging.info('Parsing inputs:')
        labels, _, _ = Parser.parse_metadata_matrix(groups_path)
        features, _, attribute_list = Parser.parse_simple_matrix(
            input_matrix_path)
        labels_list, features_list = self.transpose(labels, features,
                                                    attribute_list)
        labels_dict, labels_list_numeric = self.numerify(labels_list)

        logging.info("Tuning hyperparameters")
        random_forest_model, test_features, test_labels, best_params_list = self.tune(
            features_list, labels_list_numeric, testing_portion, grid_search,
            threads, model)

        logging.info('Making predictions on test data:')
        predictions = random_forest_model.predict(test_features)
        errors = abs(predictions - test_labels)

        # Format to two decimals directly; rounding and then using %f
        # printed six decimals, and the labels carry no "degrees" unit.
        logging.info('\t\tMean Absolute Error: %.2f', np.mean(errors))
        accuracy = self.estimate_correctness(predictions, test_labels)

        logging.info('\t\tAccuracy: %f%%', accuracy)
        best_params_list.append(["Accuracy", str(accuracy)])

        # Each log message now precedes the file it actually describes.
        logging.info("Generating attribute importances")
        output_attribute_importances = self.get_importances(
            random_forest_model, attribute_list)
        Writer.write(
            output_attribute_importances,
            os.path.join(output_directory, self.attribute_importances))

        logging.info("Generating model accuracy summary file")
        Writer.write(best_params_list,
                     os.path.join(output_directory, self.model_accuracy))

        logging.info("Preserving model")
        with open(os.path.join(output_directory, self.model_pickle),
                  'wb') as model_io:
            pickle.dump(random_forest_model, model_io)

        logging.info("Preserving group labels")
        with open(os.path.join(output_directory, self.labels_dict),
                  'wb') as labels_io:
            pickle.dump(labels_dict, labels_io)
Esempio n. 4
0
    def do(self, custom_modules, cutoff, aggregate, genome_and_annotation_file,
           genome_and_annotation_matrix, output_directory):
        '''
        Work out, for every genome, which modules are covered to at least
        `cutoff` and write the per-genome summaries to output_directory.

        Parameters
        ----------
        custom_modules                  - string. Path to file containing custom module definitions,
                                          consistent with KEGG module nomenclature
                                          (http://www.genome.jp/kegg/module.html)
        cutoff                          - float. Fraction of a module needed in order to be included
                                          in the output.
        aggregate                       - bool. If set, genome_and_annotation_matrix is also read as
                                          an abundance matrix and the mean abundance of the
                                          annotations found for each module is written per genome.
        genome_and_annotation_file      - string. Path to file containing genome - annotation file. This file
                                          contains two columns, the first with the genome name, the
                                          second with an annotation within that genome
        genome_and_annotation_matrix    - string. Path to file containing genome - annotation matrix
        output_directory                - string. Path to directory to write results to.

        Raises
        ------
        ValueError                      - if neither genome_and_annotation_file nor
                                          genome_and_annotation_matrix is provided.
        '''
        # Fail early with a clear message instead of an UnboundLocalError
        # further down when no annotation input was given.
        if not (genome_and_annotation_file or genome_and_annotation_matrix):
            raise ValueError(
                'Either genome_and_annotation_file or '
                'genome_and_annotation_matrix must be provided')

        if custom_modules:
            logging.info('Reading in custom modules: %s' % custom_modules)
            self.update_with_custom_modules(custom_modules)

        # TODO: remove me there is a duplicated parser below
        if genome_and_annotation_file:
            genome_to_annotation_sets = Parser.parse_genome_and_annotation_file_lf(
                genome_and_annotation_file)
        else:
            genome_to_annotation_sets = Parser.parse_genome_and_annotation_file_matrix(
                genome_and_annotation_matrix)

        if aggregate:
            # NOTE(review): abundances are always read from the matrix, even
            # when the annotations came from genome_and_annotation_file -
            # confirm callers pass the matrix whenever aggregate is set.
            logging.info('Reading in abundances: %s' %
                         (genome_and_annotation_matrix))
            abundances, _, _ = Parser.parse_simple_matrix(
                genome_and_annotation_matrix)
            abundance_result = dict()

        logging.info("Read in annotations for %i genomes" %
                     len(genome_to_annotation_sets))

        output_lines = [
            '\t'.join([
                "Genome_name", "Module_id", "Module_name", "Steps_found",
                "Steps_needed", "Percent_steps_found"
            ]) + '\n'
        ]  # "KO_found", "KO_needed", "Percent_KO_found"

        genome_output_lines = [
            '\t'.join(["Genome_name", "Module_id", "Module_name"]) + '\n'
        ]

        for name, pathway_string in self.m2def.items():

            if name in self.signature_modules:
                continue

            path = ModuleDescription(pathway_string)

            for genome, annotations in genome_to_annotation_sets.items():

                num_covered, _, _, ko_path = path.num_covered_steps(
                    annotations)
                num_all = path.num_steps()
                perc_covered = num_covered / float(num_all)
                ko_path_list = list(chain(*ko_path.values()))

                if perc_covered < cutoff:
                    continue

                if path.is_single_step:
                    # Single step modules are reported as 1/1 or 0/1.
                    if perc_covered == 1:
                        num_all = 1
                        num_covered = 1
                    elif cutoff < 1:
                        num_all = 1
                        num_covered = 0
                        perc_covered = 0.0
                    else:
                        continue

                if aggregate:

                    if genome not in abundance_result:
                        abundance_result[genome] = dict()

                    pathway_abundance = [
                        abundances[genome][ko] for ko in ko_path_list
                    ]

                    # A module can pass the cutoff without any annotation
                    # being found (e.g. cutoff 0); avoid dividing by zero.
                    if pathway_abundance:
                        pathway_average_abundance = sum(
                            pathway_abundance) / len(pathway_abundance)
                    else:
                        pathway_average_abundance = 0.0

                    abundance_result[genome][name] = pathway_average_abundance

                genome_output_lines.append(
                    [genome, name, self.m[name], ','.join(ko_path_list)])
                output_lines.append([
                    genome, name, self.m[name],
                    str(num_covered),
                    str(num_all),
                    str(round(perc_covered * 100, 2))
                ])

        Writer.write(output_lines,
                     os.path.join(output_directory, self.KO_OUTPUT))
        Writer.write(genome_output_lines,
                     os.path.join(output_directory, self.MODULE_PATHS))

        if aggregate:
            samples = list(abundance_result.keys())
            output_lines = ['\t'.join(["ID"] + samples) + '\n']

            for module in self.m2def.keys():

                if module not in self.signature_modules:
                    output_line = [module]

                    for sample in samples:

                        if module in abundance_result[sample]:
                            output_line.append(
                                str(abundance_result[sample][module]))

                        else:
                            # Module not reported for this sample.
                            output_line.append('0.0')
                    output_lines.append(output_line)

            Writer.write(output_lines,
                         os.path.join(output_directory, self.AGGREGATE_OUTPUT))
Esempio n. 5
0
    def network_pipeline(self,
           subparser_name,
           matrix, genome_metadata_path,
           transcriptome_abundances_path, transcriptome_metadata_path,
           metagenome_abundances, metagenome_metadata_path,
           metabolome,
           enrichment_output,
           depth, filter, limit, queries, output_directory):
        '''
        Build a network from an annotation matrix and any optional
        abundance / enrichment inputs, then write the network and its node
        metadata to output_directory.

        Parameters
        ----------
        subparser_name                  - subcommand to run; must equal self.EXPLORE or
                                          self.PATHWAY
        matrix                          - path handed to Parser.parse_simple_matrix
        genome_metadata_path            - optional path handed to Parser.parse_metadata_matrix;
                                          if not given self.mock_metadata is used instead
        transcriptome_abundances_path   - optional path handed to Parser.parse_tpm_values
        transcriptome_metadata_path     - path handed to Parser.parse_metadata_matrix when
                                          transcriptome abundances are given
        metagenome_abundances           - optional path handed to Parser.parse_simple_matrix;
                                          if not given a single 'MOCK' sample is used
        metagenome_metadata_path        - path handed to Parser.parse_metadata_matrix when
                                          metagenome abundances are given
        metabolome                      - optional path handed to Parser.parse_simple_matrix
        enrichment_output               - optional path handed to Parser.parse_enrichment_output
        depth                           - forwarded to NetworkBuilder.query_matrix (explore)
        filter                          - forwarded to NetworkBuilder.pathway_matrix (pathway)
        limit                           - forwarded to NetworkBuilder.pathway_matrix (pathway)
        queries                         - forwarded to NetworkBuilder.query_matrix (explore)
        output_directory                - directory the output files are written into

        Raises
        ------
        ValueError                      - if subparser_name is not a known subcommand
        '''
        # Reject unknown subcommands before doing any parsing; previously
        # this only surfaced at the very end as an UnboundLocalError.
        if subparser_name not in (self.EXPLORE, self.PATHWAY):
            raise ValueError(
                'Unknown network subcommand: %s' % subparser_name)

        orthology_matrix, genome_names, _ = Parser.parse_simple_matrix(matrix)
        if genome_metadata_path:
            genome_to_group, genome_groups, group_to_genome = \
                    Parser.parse_metadata_matrix(genome_metadata_path)
        else:
            genome_to_group, genome_groups, group_to_genome = \
                    self.mock_metadata(genome_names)

        reaction_matrix = self.aggregate_dictionary(self.reaction_to_ko, orthology_matrix)

        # Read in fisher results
        if enrichment_output:
            logging.info('Parsing input enrichment results')
            fisher_results = Parser.parse_enrichment_output(enrichment_output)
        else:
            logging.info('No enrichment results provided')
            fisher_results = None

        # Read in metabolome abundances
        if metabolome:
            logging.info('Parsing metabolome abundances')
            # NOTE(review): every other call site unpacks or indexes the
            # result of parse_simple_matrix; here the whole return value is
            # passed on - confirm that is what NetworkBuilder expects.
            abundances_metabolome = Parser.parse_simple_matrix(metabolome)
        else:
            logging.info('No metabolome abundances provided')
            abundances_metabolome = None

        # Read in genome metagenome_abundances
        if metagenome_abundances:
            logging.info('Parsing input genome abundances')
            sample_abundance = Parser.parse_simple_matrix(metagenome_abundances)[0]
            sample_metadata = Parser.parse_metadata_matrix(metagenome_metadata_path)[2]
        else:
            # FIXME : There's always a better way than faking it.
            logging.info('No genome abundances provided')
            # A single mock sample in which every key has an abundance of 1.
            sample_abundance = {'MOCK': {x: 1 for x in reaction_matrix.keys()}}
            sample_metadata = {"abundance": ['MOCK']}

        median_sample_abundances = self.median_genome_abundance(sample_abundance, sample_metadata)
        normalised_abundance_dict = self.normalise_by_abundance(
            median_sample_abundances, reaction_matrix, group_to_genome,
            genome_to_group, genome_groups)
        abundances_metagenome = self.average(normalised_abundance_dict)

        # Read in expression (TPM) values
        if transcriptome_abundances_path:
            logging.info("Parsing detectM TPM abundances")
            transcriptome_metadata = Parser.parse_metadata_matrix(transcriptome_metadata_path)[2]
            transcriptome_abundance_dict = self.average_tpm_by_sample(
                Parser.parse_tpm_values(transcriptome_abundances_path),
                transcriptome_metadata)
            transcriptome_abundances = self.average_tpm_values(transcriptome_abundance_dict, group_to_genome)
        else:
            transcriptome_abundances = None

        network_builder = NetworkBuilder(group_to_genome, abundances_metagenome,
                                         transcriptome_abundances, abundances_metabolome,
                                         fisher_results)

        # Run the subcommand specified (validated at the top of the method)
        if subparser_name == self.EXPLORE:
            network_lines, node_metadata = network_builder.query_matrix(queries, depth)
        elif subparser_name == self.PATHWAY:
            network_lines, node_metadata = network_builder.pathway_matrix(limit, filter)

        # Write the outputs
        Writer.write(network_lines, os.path.join(output_directory, self.NETWORK_OUTPUT_FILE))
        Writer.write(node_metadata, os.path.join(output_directory, self.METADATA_OUTPUT_FILE))

        logging.info('Finished the %s pipeline' % subparser_name)
Esempio n. 6
0
    def do(# Input options
           self, annotate_output, annotation_matrix, metadata_path, abundances_path, abundance_metadata_path, transcriptome_path, transcriptome_metadata_path,
           # Runtime options
           pval_cutoff, proportions_cutoff,
           threshold, multi_test_correction, batchfile, processes, allow_negative_values,
           ko, pfam, tigrfam, cluster, ortholog, cazy, ec, ko_hmm,
           # Output options
           output_directory):
        '''
        Run enrichment tests on an annotation matrix and write the test
        results, proportions and summary plots to output_directory.

        Parameters
        ----------
        annotate_output             - optional path handed to ParseAnnotate; when given,
                                      annotation_matrix is replaced by the matrix selected
                                      with the first set flag of ko, ko_hmm, pfam, tigrfam,
                                      cluster, ortholog, cazy, ec
        annotation_matrix           - path handed to Parser.parse_simple_matrix
        metadata_path               - path handed to Parser.parse_metadata_matrix
        abundances_path             - optional path; when given the weighted abundance
                                      tests are run instead of the genome set comparison
        abundance_metadata_path     - path handed to Parser.parse_metadata_matrix in the
                                      abundance branch
        transcriptome_path,
        transcriptome_metadata_path,
        allow_negative_values       - not used in this method
        pval_cutoff                 - forwarded to the plotting and module completeness
                                      steps
        proportions_cutoff          - forwarded to self.calculate_portions
        threshold,
        multi_test_correction,
        processes                   - forwarded to Test
        batchfile                   - optional metadata file of reference genomes to add
                                      to the comparison
        output_directory            - directory all outputs are written into
        '''

        plot  = Plot()
        database  = Databases()

        if annotate_output:
            logging.info('Parsing annotate output: %s' % (annotate_output))
            pa = ParseAnnotate(annotate_output, processes)

            # The first flag that is set decides which matrix is used. If
            # none is set, the annotation_matrix argument is kept.
            if ko:
                annotation_matrix = pa.ko
            elif ko_hmm:
                annotation_matrix = pa.ko_hmm
            elif pfam:
                annotation_matrix = pa.pfam
            elif tigrfam:
                annotation_matrix = pa.tigrfam
            elif cluster:
                annotation_matrix = pa.cluster
            elif ortholog:
                annotation_matrix = pa.ortholog
            elif cazy:
                annotation_matrix = pa.cazy
            elif ec:
                annotation_matrix = pa.ec

        annotations_dict, _, annotations, = Parser.parse_simple_matrix(annotation_matrix)
        annotation_type = self.check_annotation_type(annotations)

        logging.info('Parsing metadata: %s' % metadata_path)
        metadata, metadata_value_lists, attribute_dict = Parser.parse_metadata_matrix(metadata_path)

        if abundances_path:
            logging.info('Running abundances pipeline')
            logging.info('Parsing sample abundance')
            abundances_dict, _, _ = Parser.parse_simple_matrix(abundances_path)

            logging.info('Parsing sample metadata')
            _, _, ab_attribute_dict = Parser.parse_metadata_matrix(abundance_metadata_path)

            test = Test(annotations_dict, None, annotation_type, threshold, multi_test_correction, processes, database)
            weighted_abundance = self.weight_annotation_matrix(abundances_dict, annotations_dict, ab_attribute_dict, annotations)
            results = test.test_weighted_abundances(weighted_abundance, annotations)

            for test_result_lines, test_result_output_file in results:
                test_result_output_path = os.path.join(output_directory, test_result_output_file)
                Writer.write(test_result_lines, test_result_output_path)

        else:

            if batchfile:
                gtdb_annotation_matrix = self.get_gtdb_database_path(annotation_type, database)

                batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict = Parser.parse_metadata_matrix(batchfile)
                genomes_set = set(batchfile_metadata.keys())
                reference_genome_annotations, genomes_set = Parser.filter_large_matrix(genomes_set, gtdb_annotation_matrix)

                annotations_dict.update(reference_genome_annotations)
                new_batchfile_attribute_dict = dict()

                # Keep only the accessions returned by the filter and drop
                # groups that end up empty.
                for group_name, accession_id_list in batchfile_attribute_dict.items():
                    filtered_accession_id_list = [accession_id for accession_id in accession_id_list if accession_id in genomes_set]

                    if filtered_accession_id_list:
                        new_batchfile_attribute_dict[group_name] = filtered_accession_id_list

                attribute_dict.update(new_batchfile_attribute_dict)
                batchfile_metadata = {genome_id: batchfile_metadata[genome_id] for genome_id in genomes_set}
                metadata.update(batchfile_metadata)
                batchfile_metadata_value_lists = set(new_batchfile_attribute_dict.keys())
                metadata_value_lists = metadata_value_lists.union(batchfile_metadata_value_lists)

            logging.info("Comparing sets of genomes")
            combination_dict = dict()

            # product() over a single iterable yields one 1-tuple per value.
            for combination in product(metadata_value_lists):
                genome_list = list()

                for genome, attributes in metadata.items():

                    for feature in combination:

                        if feature in attributes:
                            genome_list.append(genome)

                combination_dict['_'.join(combination)] = genome_list

            test = Test(annotations_dict, combination_dict, annotation_type, threshold, multi_test_correction, processes, database)
            results = test.do(attribute_dict)

            for test_result_lines, test_result_output_file in results:
                test_result_output_path = os.path.join(output_directory, test_result_output_file)
                Writer.write(test_result_lines, test_result_output_path)

            # The proportions need combination_dict and genome_list, which
            # only exist in this branch. Running this step unconditionally
            # made the abundance branch fail with an UnboundLocalError.
            # NOTE(review): genome_list is the list built for the last
            # combination only - confirm that is what calculate_portions
            # expects.
            raw_proportions_output_lines = self.calculate_portions(annotations, combination_dict, annotations_dict, genome_list, proportions_cutoff)
            Writer.write(raw_proportions_output_lines, os.path.join(output_directory, self.PROPORTIONS))

        logging.info('Generating summary plots')

        if annotation_type==self.KEGG:
            logging.info('Finding module completeness in differentially abundant KOs')

            for result_file in os.listdir(output_directory):

                if result_file.endswith(("fisher.tsv", "cdf.tsv")):
                    result_path = os.path.join(output_directory, result_file)
                    plot.draw_barplots(result_path, pval_cutoff, output_directory)
                    module_output, prefix = self.module_completeness(database, result_path, pval_cutoff)
                    Writer.write(module_output, os.path.join(output_directory, prefix +'_'+ self.MODULE_COMPLETENESS))

        plot.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
Esempio n. 7
0
    def classify_pipeline(self, custom_modules, cutoff, aggregate,
                          genome_and_annotation_matrix, module_rules_json,
                          gff_file, output_directory):
        '''
        Work out, for every genome, which modules are covered to at least
        `cutoff` and write the per-genome summaries to output_directory.

        Parameters
        ----------
        custom_modules                  - string. Path to file containing custom module definitions,
                                          consistent with KEGG module nomenclature
                                          (http://www.genome.jp/kegg/module.html)
        cutoff                          - float. Fraction of a module needed in order to be included
                                          in the output.
        aggregate                       - bool. If set, genome_and_annotation_matrix is also read as
                                          an abundance matrix and the mean abundance of the
                                          annotations found for each module is written per genome.
        genome_and_annotation_matrix    - string. Path to file containing genome - annotation matrix
        module_rules_json               - string. Optional path loaded with RulesJson; modules that
                                          pass the cutoff must then also pass ClassifyChecks.check.
        gff_file                        - string. Optional path to a GFF file to read annotations
                                          from instead of the matrix.
        output_directory                - string. Path to directory to write results to.
        '''
        if module_rules_json:
            cc = ClassifyChecks(RulesJson().load(module_rules_json))

        if custom_modules:
            logging.info(f'Reading in custom modules: {custom_modules}')
            modules_to_classify = self.update_with_custom_modules(
                custom_modules)
        else:
            modules_to_classify = self.m2def

        if gff_file:
            logging.info("Reading in annotations from an input GFF file")
            annotation_results, genome_to_annotation_sets = Parser.parse_gff(
                gff_file)
        else:
            # NOTE(review): annotation_results is only available when a GFF
            # file is given, but it is required further down whenever
            # module_rules_json is set - confirm rules are only ever used
            # together with a GFF file.
            logging.info("Reading in annotations from an input matrix")
            genome_to_annotation_sets, _, _ = Parser.parse_simple_matrix(
                genome_and_annotation_matrix)

        if aggregate:
            logging.info(
                f'Reading in abundances: {genome_and_annotation_matrix}')
            abundances, _, _ = Parser.parse_simple_matrix(
                genome_and_annotation_matrix)
            abundance_result = dict()

        logging.info(
            f"Read in annotations for {len(genome_to_annotation_sets)} genomes"
        )

        output_lines = [[
            "Genome_name", "Module_id", "Module_name", "Steps_found",
            "Steps_needed", "Percent_steps_found"
        ]]

        genome_output_lines = [["Genome_name", "Module_id", "Module_name"]]

        for name, pathway_string in modules_to_classify.items():

            if name in self.signature_modules:
                continue

            path = ModuleDescription(pathway_string)

            for genome, annotation_frequency in genome_to_annotation_sets.items():
                annotations = get_present_annotations(annotation_frequency)
                num_covered, _, _, ko_path = path.num_covered_steps(
                    annotations)
                num_all = path.num_steps()
                perc_covered = num_covered / float(num_all)
                ko_path_list = list(chain(*ko_path.values()))

                if perc_covered < cutoff:
                    continue

                if module_rules_json:
                    rule_check_result = cc.check(name,
                                                 annotation_results[genome])
                else:
                    rule_check_result = True

                if not rule_check_result:
                    continue

                if path.is_single_step:
                    # Single step modules are reported as 1/1 or 0/1.
                    if perc_covered == 1:
                        num_all = 1
                        num_covered = 1
                    elif cutoff < 1:
                        num_all = 1
                        num_covered = 0
                        perc_covered = 0.0
                    else:
                        continue

                if aggregate:

                    if genome not in abundance_result:
                        abundance_result[genome] = dict()

                    pathway_abundance = [
                        abundances[genome][ko] for ko in ko_path_list
                    ]

                    # No annotations found for the module: report 0 rather
                    # than dividing by zero.
                    if pathway_abundance:
                        pathway_average_abundance = sum(
                            pathway_abundance) / len(pathway_abundance)
                    else:
                        pathway_average_abundance = 0

                    abundance_result[genome][name] = pathway_average_abundance

                genome_output_lines.append([
                    genome, name, self.modules[name],
                    ','.join(ko_path_list)
                ])
                output_lines.append([
                    genome, name, self.modules[name],
                    str(num_covered),
                    str(num_all),
                    str(round(perc_covered * 100, 2))
                ])

        Writer.write(output_lines,
                     os.path.join(output_directory, self.ko_output))
        Writer.write(genome_output_lines,
                     os.path.join(output_directory, self.module_paths))

        if aggregate:
            samples = list(abundance_result.keys())
            # Header as a list of fields, like every other row handed to
            # Writer.write in this method (it used to be a pre-joined
            # string ending in a newline).
            output_lines = [["ID"] + samples]

            for module in modules_to_classify.keys():

                if module not in self.signature_modules:
                    output_line = [module]

                    for sample in samples:

                        if module in abundance_result[sample]:
                            output_line.append(
                                str(abundance_result[sample][module]))

                        else:
                            # Module not reported for this sample.
                            output_line.append('0.0')
                    output_lines.append(output_line)

            Writer.write(output_lines,
                         os.path.join(output_directory, self.aggregate_output))