Example #1
    def __init__(self):
        databases = Databases()
        self.signature_modules = databases.signature_modules
        self.m2def = databases.m2def()
        self.modules = databases.m()
        self.ko_output = "module_completeness.tsv"
        self.module_paths = "module_paths.tsv"
        self.aggregate_output = "aggregate_output.tsv"
Example #2
    def __init__(self, output_directory, annotate_ko, annotate_ko_hmm,
                 annotate_pfam, annotate_tigrfam, annotate_cluster,
                 annotate_ortholog, annotate_cazy, annotate_ec,
                 annotate_orthogroup, evalue, bit, percent_id_cutoff,
                 aln_query, aln_reference, fraction_aligned, cut_ga_pfam,
                 cut_nc_pfam, cut_tc_pfam, cut_ga_tigrfam, cut_nc_tigrfam,
                 cut_tc_tigrfam, cut_hmm, inflation, chunk_number, chunk_max,
                 count_domains, threads, parallel, suffix, light):

        # Define inputs and outputs
        self.output_directory = output_directory

        # Define type of annotation to be carried out
        self.annotate_ko = annotate_ko
        self.annotate_ko_hmm = annotate_ko_hmm
        self.annotate_pfam = annotate_pfam
        self.annotate_tigrfam = annotate_tigrfam
        self.annotate_cluster = annotate_cluster
        self.annotate_ortholog = annotate_ortholog
        self.annotate_orthogroup = annotate_orthogroup
        self.annotate_cazy = annotate_cazy
        self.annotate_ec = annotate_ec

        # Cutoffs
        self.evalue = evalue
        self.bit = bit
        self.percent_id_cutoff = percent_id_cutoff
        self.aln_query = aln_query
        self.aln_reference = aln_reference
        self.fraction_aligned = fraction_aligned
        self.cut_ga_pfam = cut_ga_pfam
        self.cut_nc_pfam = cut_nc_pfam
        self.cut_tc_pfam = cut_tc_pfam
        self.cut_ga_tigrfam = cut_ga_tigrfam
        self.cut_nc_tigrfam = cut_nc_tigrfam
        self.cut_tc_tigrfam = cut_tc_tigrfam
        self.cut_hmm = cut_hmm
        self.inflation = inflation
        self.chunk_number = chunk_number
        self.chunk_max = chunk_max
        self.count_domains = count_domains

        # Parameters
        self.threads = threads
        self.parallel = parallel
        self.suffix = suffix
        self.light = light

        # Set up multiprocessing pool
        self.pool = mp.Pool(processes=int(self.parallel))

        # Load databases
        self.databases = Databases()
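
Examples #2, #11 and #19 all open a multiprocessing pool in the constructor. A minimal, self-contained sketch of the open/close discipline such a pool needs (nothing here is EnrichM's API beyond the mp.Pool(processes=...) call shown above):

import multiprocessing as mp

def run_jobs(jobs, parallel):
    # Open the pool, hand out the work, then close and join it so worker
    # processes do not outlive the run.
    pool = mp.Pool(processes=int(parallel))
    try:
        results = pool.map(len, jobs)  # placeholder task
    finally:
        pool.close()
        pool.join()
    return results

if __name__ == '__main__':
    print(run_jobs(["aa", "bbb"], parallel=2))  # prints [2, 3]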
Example #3
    def __init__(self):

        d = Databases()
        self.ko_re              = re.compile(r'^K\d+$')
        self.signature_modules  = d.signature_modules
        self.m2def              = d.m2def
        self.m                  = d.m
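
A quick sketch of what the ko_re pattern above accepts — KEGG ortholog ids of the form K followed by digits, and nothing else:

import re

ko_re = re.compile(r'^K\d+$')
assert ko_re.match('K00001')             # a KO id matches
assert not ko_re.match('PF00001')        # a Pfam id does not
assert not ko_re.match('K00001_extra')   # trailing characters are rejected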
Example #4
    def __init__(self, annotation_type, clusters=None):
        '''
        Interpret which annotation type to write a matrix for.

        Parameters
        ----------
        annotation_type - String.
        clusters        - List. Cluster or ortholog names; only used when
                          annotation_type is hypothetical or ortholog.
        '''
        self.annotation_type = annotation_type
        self.databases = Databases()

        if self.annotation_type in (self.KO, self.EC, self.PFAM,
                                    self.TIGRFAM, self.CAZY):
            ids_path = os.path.join(self.databases.IDS_DIR, self.annotation_type)
            self.annotation_list = [x.strip() for x in open(ids_path)]

        elif self.annotation_type in (self.HYPOTHETICAL, self.ORTHOLOG):
            self.annotation_list = clusters

        else:
            raise Exception("Annotation type not found: %s" % self.annotation_type)
Example #5
    def __init__(self, metadata_keys):

        self.d = Databases()

        self.metadata_keys = list(metadata_keys)
        self.matrix_header = ["compound", "reaction", 'type']
        self.transcriptome_header = (
            [key + '_reaction_transcriptome' for key in self.metadata_keys] +
            [key + '_reaction_expression' for key in self.metadata_keys])
        self.compound_header = [key + '_compound' for key in self.metadata_keys]
        self.metadata_header = ['node',
                                'description',
                                'type',
                                'module',
                                'module_descr',
                                'pathway',
                                'pathway_descr',
                                'node_type']
        self.query_header = ['query', 'step']
        self.compound_reaction_index_header = []
        self.step_header = ['step']
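
For concreteness, a sketch of the headers this constructor builds, assuming the hypothetical metadata_keys = ['groupA', 'groupB']:

# transcriptome_header:
#   ['groupA_reaction_transcriptome', 'groupB_reaction_transcriptome',
#    'groupA_reaction_expression', 'groupB_reaction_expression']
# compound_header:
#   ['groupA_compound', 'groupB_compound']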
Example #6
    def __init__(self, matrix, transcriptome):
        d = Databases()
        self.r2k = d.r2k
        logging.info("Parsing input matrix: %s" % matrix)
        self.orthology_matrix = self._parse_matrix(matrix)
        logging.info("Calculating reaction abundances")
        self.reaction_matrix = self._calculate_abundances(
            self.r2k, self.orthology_matrix)

        if transcriptome:
            logging.info("Parsing input transcriptome: %s" % transcriptome)
            self.orthology_matrix_transcriptome = self._parse_matrix(transcriptome)

            logging.info("Calculating reaction transcriptome abundances")
            self.reaction_matrix_transcriptome = self._calculate_abundances(
                self.r2k, self.orthology_matrix_transcriptome)

            logging.info("Calculating normalized expression abundances")
            self.orthology_matrix_expression = self._calculate_expression_matrix(
                self.orthology_matrix, self.orthology_matrix_transcriptome)

            logging.info("Calculating reaction expression abundances")
            self.reaction_matrix_expression = self._calculate_abundances(
                self.r2k, self.orthology_matrix_expression)
Example #7
    def __init__(self):
        
        databases = Databases()
        
        self.reaction_to_ko = databases.r2k()
        self.compound_to_reaction = databases.c2r()
        self.compounds = databases.c()

        self.positive = 'positive'
        self.negative = 'negative'

        self.abundance = "frequency_matrix.tsv"
        self.enrichment = "enrichment_results.tsv"
        self.abundance_header = ["Compound"]
        self.enrichment_header = ["Compound", "Group_1", "Group_2", "group_1_mean", "group_2_mean",
                                  "score", "pvalue", "description"]
Example #8
    def __init__(self):
        self.databases = Databases()
        path_to_scripts = os.path.split(os.path.realpath(__file__))[0]
        self.draw_pca_script_path = os.path.join(path_to_scripts,
                                                 "PLOT_ko_pca.r")
        self.draw_heatmap_script_path = os.path.join(path_to_scripts,
                                                     "PLOT_ko_heatmap.r")
        self.draw_barplots_script_path = os.path.join(path_to_scripts,
                                                      "PLOT_ko_breakdown.r")
        self.ko00000 = os.path.join(Data.DATABASE_DIR,
                                    self.databases.DB_VERSION, 'ko00000.tsv')
        self.output_pca_plot = 'presence_absence_pca_plot.svg'
        self.output_heatmap_plot = 'presence_absence_heatmap_plot.svg'
Example #9
    def __init__(self):
        d = Databases()

        self.m2def = d.m2def()
        self.m2c = d.m2c()
        self.c = d.c()
        self.m = d.m()
        self.c2m = d.c2m()


        self.signature_modules = d.signature_modules
        self.output_file = 'linkages.tsv'
Example #10
    def __init__(self):
        d = Databases()

        self.m2def = d.m2def
        self.m2c = d.m2c
        self.c2m = dict()

        for module, compounds in self.m2c.items():
            substrates = compounds[0]
            for substrate in substrates:
                if substrate in self.c2m:
                    self.c2m[substrate].append(module)
                else:
                    self.c2m[substrate] = [module]

        self.c = d.c
        self.m = d.m
        self.signature_modules = d.signature_modules
        self.output_file = 'linkages.tsv'
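
The loop above inverts the module-to-substrate mapping into substrate-to-module. A minimal equivalent sketch with collections.defaultdict, using toy compound and module ids (the real m2c values come from Databases()):

from collections import defaultdict

m2c = {'M00001': (['C00022'], ['C00024']),
       'M00002': (['C00022', 'C00031'], ['C00074'])}  # toy data

c2m = defaultdict(list)
for module, (substrates, _products) in m2c.items():
    for substrate in substrates:
        c2m[substrate].append(module)

print(dict(c2m))  # {'C00022': ['M00001', 'M00002'], 'C00031': ['M00002']}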
Example #11
    def __init__(self, output_directory, ko, pfam, tigrfam, hypothetical, cazy,
                 ec, evalue, bit, id, aln_query, aln_reference, c, cut_ga,
                 cut_nc, cut_tc, inflation, chunk_number, chunk_max,
                 count_domains, threads, parallel, suffix, light):

        # Define inputs and outputs
        self.output_directory = output_directory

        # Define type of annotation to be carried out
        self.ko = ko
        self.pfam = pfam
        self.tigrfam = tigrfam
        self.hypothetical = hypothetical
        self.cazy = cazy
        self.ec = ec

        # Cutoffs
        self.evalue = evalue
        self.bit = bit
        self.id = id
        self.aln_query = aln_query
        self.aln_reference = aln_reference
        self.c = c
        self.cut_ga = cut_ga
        self.cut_nc = cut_nc
        self.cut_tc = cut_tc
        self.inflation = inflation
        self.chunk_number = chunk_number
        self.chunk_max = chunk_max
        self.count_domains = count_domains

        # Parameters
        self.threads = threads
        self.parallel = parallel
        self.suffix = suffix
        self.light = light

        # Set up multiprocessing pool
        self.pool = mp.Pool(processes=int(self.parallel))

        # Load databases
        self.databases = Databases()
Example #12
    @staticmethod
    def parse_tpm_values(tpm_values):

        from enrichm.databases import Databases

        k2r = Databases().k2r()

        output_dict = dict()
        annotation_types = set()
        genome_types = set()

        tpm_values_io = open(tpm_values, 'rb')
        tpm_values_io.readline()

        for line in tpm_values_io:
            gene, _, _, _, _, _, _, _, _, _, tpm, \
            _, _, annotation, sample = line.strip().split(b'\t')
            annotation_list = annotation.split(b',')
            tpm = float(tpm)
            genome = '_'.join(str(gene, "utf-8").split('_')[:2]) # temporary
            genome_types.add(genome)

            if sample not in output_dict:
                output_dict[sample] = dict()

            if genome not in output_dict[sample]:
                output_dict[sample][genome] = dict()

            for annotation_type in annotation_list:

                if str(annotation_type, "utf-8") in k2r:
                    reactions = k2r[str(annotation_type, "utf-8")]

                    for reaction in reactions:
                        reaction = str.encode(reaction)

                        if reaction not in output_dict[sample][genome]:
                            output_dict[sample][genome][reaction] = 0.0
                            annotation_types.add(reaction)

                        output_dict[sample][genome][reaction] += tpm
        return output_dict, annotation_types, genome_types
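
A hedged usage sketch for parse_tpm_values: it expects a tab-separated table whose 11th column is a TPM value, 14th a comma-separated annotation list, and 15th a sample id (the path below is hypothetical):

# output_dict, annotation_types, genome_types = parse_tpm_values('tpm_table.tsv')
# for sample, genomes in output_dict.items():        # keys are bytes
#     for genome, reactions in genomes.items():
#         ...  # reactions maps reaction id (bytes) -> summed TPM (float)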
Example #13
    def __init__(self, metadata, abundances_metagenome,
                 abundances_transcriptome, abundances_metabolome,
                 fisher_results):
        self.abundances_metagenome = abundances_metagenome
        self.abundances_transcriptome = abundances_transcriptome
        self.abundances_metabolome = abundances_metabolome
        self.fisher_results = fisher_results
        self.metadata_keys = list(metadata.keys())

        databases = Databases()

        self.reaction_to_module = databases.r2m()
        self.module_to_reaction = databases.m2r()
        self.module_descriptions = databases.m()
        self.reaction_to_pathway = databases.r2p()
        self.pathway_to_reaction = databases.p2r()
        self.pathway_descriptions = databases.p()
        self.compound_desc_dict = databases.compound_desc_dict()
        self.compound_descriptions = databases.c()
        self.reaction_descriptions = databases.r()
        self.reactions_to_compounds = databases.r2c()
        self.reactions_to_kos = databases.r2k()

        self.matrix_header = ["compound", "reaction", 'type']
        self.transcriptome_header = [
            key + '_reaction_transcriptome' for key in self.metadata_keys
        ]
        self.compound_header = [
            key + '_compound' for key in self.metadata_keys
        ]
        self.metadata_header = [
            'node', 'description', 'type', 'module', 'module_descr', 'pathway',
            'pathway_descr', 'node_type'
        ]
        self.query_header = ['query', 'step']
        self.step_header = ['step']
        self.to_omit = set([
            "C00828",  # Menaquinone
            "C00534",  # Pyridoxamine
            "C00006",  # NADP+
            "C00003",  # NAD+
            "C00002",  # ATP
            "C00314",  # Pyridoxine
            "C00864",  # Pantothenate
            "C00504",  # Folate
            "C00032",  # Heme
            "C05443",  # Vitamin D3
            "C00253",  # Nicotinate
            "C00250",  # Pyridoxal
            "C11378",  # Ubiquinone-10
            "C05777",  # Coenzyme F430
            "C00072",  # Ascorbate
            "C00378",  # Thiamine
            "C00101",  # Tetrahydrofolate
            "C00029",  # UDP-glucose
            "C00068",  # Thiamin diphosphate
            "C00061",  # FMN
            "C00063",  # CTP
            "C05776",  # Vitamin B12
            "C00113",  # PQQ
            "C18237",  # Molybdoenzyme molybdenum cofactor
            "C00051",  # Glutathione
            "C00010",  # CoA
            "C00016",  # FAD
            "C00018",  # Pyridoxal phosphate
            "C00019",  # S-Adenosyl-L-methionine
            "C00153",  # Nicotinamide
            "C04628",  # Coenzyme B
            "C00862",  # Methanofuran
            "C15672",  # Heme O
            "C15670",  # Heme A
            "C02059",  # Phylloquinone
            "C03576",  # Coenzyme M
            "C05441",  # Vitamin D2
            "C00272",  # Tetrahydrobiopterin
            "C02477",  # alpha-Tocopherol
            "C00473",  # Retinol
            "C00120",  # Biotin
            "C00725",  # Lipoate
            "C00053",  # 3'-Phosphoadenylyl sulfate
            "C00194",  # Cobamide coenzyme
            "C00255",  # Riboflavin
            'C00001',  # H2O
            'C00008',  # ADP
            'C00013',  # Diphosphate
            'C00004',  # NADH
            'C00005',  # NADPH
            'C00080',  # H+
            'C00009',  # Orthophosphate
            'C00020',  # AMP
            'C00007',  # Oxygen
            'C00015',  # UDP
        ])
Example #14
class Tests(unittest.TestCase):

    genome_annotation_simple_example = {
        "genome_1": {
            "K00001": 1,
            "K00002": 2,
            "K00003": 0
        },
        "genome_2": {
            "K00001": 0,
            "K00002": 0,
            "K00003": 1
        },
        "genome_3": {
            "K00001": 5,
            "K00002": 4,
            "K00003": 5
        }
    }
    genome_groups_simple_example = {
        "group_1": ["genome_1"],
        "group_2": ["genome_2", "genome_3"]
    }
    simple_test_object = Test(genome_annotation_simple_example,
                              genome_groups_simple_example, "kegg", 0.1,
                              'fdr_bh', 1, Databases())
    sample_to_annotation = {
        'sample_group_1': {
            'K00001': [16.0, 25.5, 30.1],
            'K00002': [14.0, 21.0, 24.2],
            'K00003': [15.5, 26.2, 31.1]
        },
        'sample_group_2': {
            'K00001': [6.0, 6.5, 7.0],
            'K00002': [10.8, 12.4, 14.0],
            'K00003': [6.2, 5.4, 5.0]
        }
    }
    annotations = ["K00001", "K00002", "K00003"]

    def test_test_chooser(self):
        groups_1 = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
        groups_2 = [[1], [1, 2, 3, 4, 5]]

        test_instance_1 = self.simple_test_object.test_chooser(groups_1)
        test_instance_2 = self.simple_test_object.test_chooser(groups_2)

        self.assertEqual(test_instance_1[0], stats.fisher_exact)
        self.assertEqual(test_instance_1[1], stats.mannwhitneyu)
        self.assertEqual(test_instance_2[0], self.simple_test_object.PA)
        self.assertEqual(test_instance_2[1], stats.norm.cdf)

    def test_count(self):
        '''
        test both frequency and presence absence counting in Test.
        '''

        self.assertEqual(
            self.simple_test_object.count("K00001", "group_1", False), (1, 0))
        self.assertEqual(
            self.simple_test_object.count("K00001", "group_1", True), ([1], 0))
        self.assertEqual(
            self.simple_test_object.count("K00001", "group_2", False), (1, 1))
        self.assertEqual(
            self.simple_test_object.count("K00001", "group_2", True),
            ([0, 5], 0))
        self.assertEqual(
            self.simple_test_object.count("K00003", "group_2", False), (2, 0))
        self.assertEqual(
            self.simple_test_object.count("K00003", "group_2", True),
            ([1, 5], 0))

    def test_gene_frequencies(self):
        expect_1 = [['K00002', 'group_1', 'group_2', [[2], 0], [[0, 4], 0]],
                    ['K00003', 'group_1', 'group_2', [[0], 0], [[1, 5], 0]],
                    ['K00001', 'group_1', 'group_2', [[1], 0], [[0, 5], 0]]]
        expect_2 = [['K00002', 'group_1', 'group_2', [1, 0], [1, 1]],
                    ['K00003', 'group_1', 'group_2', [0, 1], [2, 0]],
                    ['K00001', 'group_1', 'group_2', [1, 0], [1, 1]]]
        for result in self.simple_test_object.gene_frequencies(
                "group_1", "group_2", True):
            if result in expect_1:
                expect_1.pop(expect_1.index(result))
        self.assertEqual(expect_1, list())

        for result in self.simple_test_object.gene_frequencies(
                "group_1", "group_2", False):
            if result in expect_2:
                expect_2.pop(expect_2.index(result))
        self.assertEqual(expect_2, list())

    def test_test_weighted_abundances(self):
        expect = [[[
            [
                'annotation', 'group_1', 'group_2', 'enriched_in',
                'group_1_mean', 'group_2_mean', 'score', 'pvalue',
                'corrected_pvalue', 'description'
            ],
            [
                'K00001', 'sample_group_1', 'sample_group_2', 'sample_group_1',
                '23.866666666666664', '6.5', 0.0, 0.04042779918502612,
                '0.060591636418731595',
                'E1.1.1.1, adh; alcohol dehydrogenase [EC:1.1.1.1]'
            ],
            [
                'K00002', 'sample_group_1', 'sample_group_2', 'sample_group_1',
                '19.733333333333334', '12.4', 0.5, 0.060591636418731595,
                '0.060591636418731595',
                'AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]'
            ],
            [
                'K00003', 'sample_group_1', 'sample_group_2', 'sample_group_1',
                '24.26666666666667', '5.533333333333334', 0.0,
                0.04042779918502612, '0.060591636418731595',
                'hom; homoserine dehydrogenase [EC:1.1.1.3]'
            ]
        ], 'sample_group_1_vs_sample_group_2_gvg_results.mannwhitneyu.tsv']]

        result = self.simple_test_object.test_weighted_abundances(
            self.sample_to_annotation, self.annotations)

        self.assertEqual(expect, result)
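
These tests use the standard unittest layout; assuming the snippet lives in its own module alongside the imports it needs (unittest, scipy.stats, and EnrichM's Test and Databases), the usual entry point applies:

if __name__ == '__main__':
    unittest.main()  # or: python -m unittest <module_name> -v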
Example #15
    def do(self,
           # Input options
           annotate_output, annotation_matrix, metadata_path, abundances_path,
           abundance_metadata_path, transcriptome_path,
           transcriptome_metadata_path,
           # Runtime options
           pval_cutoff, proportions_cutoff, threshold, multi_test_correction,
           batchfile, processes, allow_negative_values,
           ko, pfam, tigrfam, cluster, ortholog, cazy, ec, ko_hmm,
           # Output options
           output_directory):

        plot = Plot()
        database = Databases()

        if annotate_output:
            logging.info('Parsing annotate output: %s' % (annotate_output))
            pa = ParseAnnotate(annotate_output, processes)

            if ko:
                annotation_matrix = pa.ko
            elif ko_hmm:
                annotation_matrix = pa.ko_hmm
            elif pfam:
                annotation_matrix = pa.pfam
            elif tigrfam:
                annotation_matrix = pa.tigrfam
            elif cluster:
                annotation_matrix = pa.cluster
            elif ortholog:
                annotation_matrix = pa.ortholog
            elif cazy:
                annotation_matrix = pa.cazy
            elif ec:
                annotation_matrix = pa.ec

        annotations_dict, _, annotations = Parser.parse_simple_matrix(annotation_matrix)
        annotation_type = self.check_annotation_type(annotations)

        logging.info('Parsing metadata: %s' % metadata_path)
        metadata, metadata_value_lists, attribute_dict = Parser.parse_metadata_matrix(metadata_path)

        if abundances_path:
            logging.info('Running abundances pipeline')
            logging.info('Parsing sample abundance')
            abundances_dict, _, _ = Parser.parse_simple_matrix(abundances_path)

            logging.info('Parsing sample metadata')
            _, _, ab_attribute_dict = Parser.parse_metadata_matrix(abundance_metadata_path)

            test = Test(annotations_dict, None, annotation_type, threshold, multi_test_correction, processes, database)
            weighted_abundance = self.weight_annotation_matrix(abundances_dict, annotations_dict, ab_attribute_dict, annotations)
            results = test.test_weighted_abundances(weighted_abundance, annotations)

            for result in results:
                test_result_lines, test_result_output_file = result
                test_result_output_path = os.path.join(output_directory, test_result_output_file)
                Writer.write(test_result_lines, test_result_output_path)

        else:

            if batchfile:
                gtdb_annotation_matrix = self.get_gtdb_database_path(annotation_type, database)

                batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict = Parser.parse_metadata_matrix(batchfile)
                genomes_set = set(batchfile_metadata.keys())
                reference_genome_annotations, genomes_set = Parser.filter_large_matrix(genomes_set, gtdb_annotation_matrix)

                annotations_dict.update(reference_genome_annotations)
                new_batchfile_attribute_dict = dict()

                for group_name, accession_id_list in batchfile_attribute_dict.items():
                    filtered_accession_id_list = [accession_id for accession_id in accession_id_list if accession_id in genomes_set]

                    if len(filtered_accession_id_list) > 0:
                        new_batchfile_attribute_dict[group_name] = filtered_accession_id_list

                attribute_dict.update(new_batchfile_attribute_dict)
                batchfile_metadata = {group_name: batchfile_metadata[group_name] for group_name in genomes_set}
                metadata.update(batchfile_metadata)
                batchfile_metadata_value_lists = set(new_batchfile_attribute_dict.keys())
                metadata_value_lists = metadata_value_lists.union(batchfile_metadata_value_lists)

            logging.info("Comparing sets of genomes")
            combination_dict = dict()

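            # Note: product(*list([metadata_value_lists])) is equivalent to
            # product(metadata_value_lists); it yields one-element tuples, so
            # each combination below is a single metadata group.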
            for combination in product(*list([metadata_value_lists])):
                genome_list = list()

                for genome, attributes in metadata.items():

                    for feature in combination:

                        if feature in attributes:
                            genome_list.append(genome)

                combination_dict['_'.join(combination)] = genome_list

            test = Test(annotations_dict, combination_dict, annotation_type, threshold, multi_test_correction, processes, database)
            results = test.do(attribute_dict)

            for result in results:
                test_result_lines, test_result_output_file = result
                test_result_output_path = os.path.join(output_directory, test_result_output_file)
                Writer.write(test_result_lines, test_result_output_path)

        raw_proportions_output_lines = self.calculate_portions(annotations, combination_dict, annotations_dict, genome_list, proportions_cutoff)
        Writer.write(raw_proportions_output_lines, os.path.join(output_directory, self.PROPORTIONS))

        logging.info('Generating summary plots')

        if annotation_type == self.KEGG:
            logging.info('Finding module completeness in differentially abundant KOs')

            for result_file in os.listdir(output_directory):

                if result_file.endswith("fisher.tsv") or result_file.endswith("cdf.tsv"):
                    plot.draw_barplots(os.path.join(output_directory, result_file), pval_cutoff, output_directory)
                    module_output, prefix = self.module_completeness(database, os.path.join(output_directory, result_file), pval_cutoff)
                    Writer.write(module_output, os.path.join(output_directory, prefix +'_'+ self.MODULE_COMPLETENESS))

        plot.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
Example #16
    def __init__(self):
        databases = Databases()
        self.signature_modules = databases.signature_modules
        self.m2def = databases.m2def()
        self.m = databases.m()
Example #17
    def do(  # Input options
            self,
            annotate_output,
            metadata_path,
            input_modules,
            abundances,
            # Runtime options
            genomes_to_compare_with_group_file,
            pval_cutoff,
            proportions_cutoff,
            threshold,
            multi_test_correction,
            batchfile,
            processes,
            ko,
            pfam,
            tigrfam,
            hypothetical,
            cazy,
            ec,
            # Output options
            output_directory):

        p = Plot()
        c = Compare()
        d = Databases()

        if genomes_to_compare_with_group_file:
            self.genomes_to_compare_with_group = self.parse_genomes_to_compare(
                genomes_to_compare_with_group_file)
        else:
            self.genomes_to_compare_with_group = None

        logging.info('Parsing annotate output: %s' % (annotate_output))
        pa = ParseAnnotate(annotate_output, processes)

        logging.info('Parsing annotations')
        if ko:
            annotation_matrix = pa.ko
            gtdb_annotation_matrix = d.GTDB_KO
        elif pfam:
            annotation_matrix = pa.pfam
            gtdb_annotation_matrix = d.GTDB_PFAM
        elif tigrfam:
            annotation_matrix = pa.tigrfam
            gtdb_annotation_matrix = d.GTDB_TIGRFAM
        elif hypothetical:
            annotation_matrix = pa.hypothetical_cluster
            gtdb_annotation_matrix = None
        elif cazy:
            annotation_matrix = pa.cazy
            gtdb_annotation_matrix = d.GTDB_CAZY
        elif ec:
            annotation_matrix = pa.ec
            gtdb_annotation_matrix = d.GTDB_EC

        annotations_dict, modules, genomes \
                    = self._parse_annotation_matrix(annotation_matrix)

        if input_modules:
            logging.info('Limiting to %i modules' % len(modules))
            modules = input_modules

        logging.info('Parsing metadata')
        metadata, metadata_value_lists, attribute_dict \
                    = self.parse_metadata_matrix(metadata_path)

        if batchfile:
            genomes_set = set()
            batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict \
                        = self.parse_metadata_matrix(batchfile)
            genomes_set = genomes_set.union(set(batchfile_metadata.keys()))
            reference_genome_annotations, genomes_set = self.parse_gtdb_matrix(
                genomes_set, gtdb_annotation_matrix)

            annotations_dict.update(reference_genome_annotations)
            new_batchfile_attribute_dict = dict()
            for group_name, accession_id_list in batchfile_attribute_dict.items():
                filtered_ids = [g for g in accession_id_list if g in genomes_set]
                if len(filtered_ids) > 0:
                    new_batchfile_attribute_dict[group_name] = filtered_ids
            attribute_dict.update(new_batchfile_attribute_dict)
            batchfile_metadata = {
                genome: batchfile_metadata[genome]
                for genome in genomes_set
            }
            metadata.update(batchfile_metadata)
            batchfile_metadata_value_lists = set(
                new_batchfile_attribute_dict.keys())
            metadata_value_lists = metadata_value_lists.union(
                batchfile_metadata_value_lists)

        logging.info("Comparing sets of genomes")
        combination_dict = dict()
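        # As in Example #15, product over metadata_value_lists yields
        # one-element tuples: each combination is a single metadata group.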
        for combination in product(*list([metadata_value_lists])):
            genome_list = list()
            for genome, attributes in metadata.items():
                for feature in combination:
                    if feature in attributes:
                        genome_list.append(genome)
            combination_dict['_'.join(combination)] = genome_list

        annotation_type = self.check_annotation_type(modules)

        t = Test(annotations_dict, modules, genomes, combination_dict,
                 annotation_type, threshold, multi_test_correction,
                 pval_cutoff, processes, d)
        results = t.do(attribute_dict)

        for result in results:
            test_result_lines, test_result_output_file = result

            test_result_output_path = os.path.join(output_directory,
                                                   test_result_output_file)
            self._write(test_result_lines, test_result_output_path)

        raw_portions_path \
            = os.path.join(output_directory, self.PROPORTIONS)
        unique_to_groups_path \
            = os.path.join(output_directory, self.UNIQUE_TO_GROUPS)
        raw_proportions_output_lines \
            = self.calculate_portions(modules, combination_dict, annotations_dict, genome_list, proportions_cutoff)

        self._write(raw_proportions_output_lines, raw_portions_path)

        logging.info('Generating summary plots')

        if annotation_type == self.KEGG:
            logging.info(
                'Finding module completeness in differentially abundant KOs')

            for result_file in os.listdir(output_directory):

                if (result_file.endswith("fisher.tsv")
                        or result_file.endswith("cdf.tsv")):
                    p.draw_barplots(
                        os.path.join(output_directory, result_file),
                        pval_cutoff, output_directory)

                    g1_sig_kos = set()
                    g2_sig_kos = set()

                    result_file_io = open(
                        os.path.join(output_directory, result_file))
                    header = result_file_io.readline()
                    for line in result_file_io:
                        sline = line.strip().split('\t')
                        if float(sline[-2]) < pval_cutoff:
                            if result_file.endswith("fisher.tsv"):
                                g1 = float(
                                    sline[3]) / (int(sline[3]) + int(sline[4]))
                                g2 = float(
                                    sline[5]) / (int(sline[5]) + int(sline[6]))
                            elif result_file.endswith("cdf.tsv"):
                                g1 = float(sline[3])
                                g2 = float(sline[5])
                            if g1 > g2:
                                g1_sig_kos.add(sline[0])
                            else:
                                g2_sig_kos.add(sline[0])

                    module_output = [[
                        "Module", "Lineage", "Total steps", "Steps covered",
                        "Percentage covered", "Module description"
                    ]]
                    for module, definition in d.m2def.items():
                        if module not in d.signature_modules:
                            pathway = ModuleDescription(definition)
                            num_all = pathway.num_steps()
                            g1_num_covered, g1_ko_covered, g1_ko_total, g1_ko_path = pathway.num_covered_steps(
                                g1_sig_kos)
                            g1_perc_covered = g1_num_covered / float(num_all)

                            g2_num_covered, g2_ko_covered, g2_ko_total, g2_ko_path = pathway.num_covered_steps(
                                g2_sig_kos)
                            g2_perc_covered = g2_num_covered / float(num_all)
                            if g1_perc_covered > 0:
                                output_line = [
                                    module, sline[1], num_all, g1_num_covered,
                                    g1_perc_covered, d.m[module]
                                ]
                                module_output.append(output_line)
                            if g2_perc_covered > 0:
                                output_line = [
                                    module, sline[2], num_all, g2_num_covered,
                                    g2_perc_covered, d.m[module]
                                ]
                                module_output.append(output_line)

                    prefix = '_vs_'.join([sline[1],
                                          sline[2]]).replace(' ', '_')
                    self._write(
                        module_output,
                        os.path.join(output_directory,
                                     prefix + '_' + self.MODULE_COMPLETENESS))

        p.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
Example #18
    def __init__(self):
        self.databases = Databases()
        self.reactions = self.databases.r()
        self.reaction_to_ko = self.databases.r2k()
Example #19
class Annotate:
    '''
    Annotates proteins and MAGs.
    '''
    GENOME_BIN = 'genome_bin'
    GENOME_PROTEINS = 'genome_proteins'
    GENOME_GENES = 'genome_genes'
    GENOME_KO = 'annotations_ko'
    GENOME_KO_HMM = 'annotations_ko_hmm'
    GENOME_EC = 'annotations_ec'
    GENOME_PFAM = 'annotations_pfam'
    GENOME_TIGRFAM = 'annotations_tigrfam'
    GENOME_HYPOTHETICAL = 'annotations_hypothetical'
    GENOME_CAZY = 'annotations_cazy'
    GENOME_GFF = 'annotations_gff'
    GENOME_OBJ = 'annotations_genomes'
    OUTPUT_KO = 'ko_frequency_table.tsv'
    OUTPUT_KO_HMM = 'ko_hmm_frequency_table.tsv'
    OUTPUT_EC = 'ec_frequency_table.tsv'
    OUTPUT_PFAM = 'pfam_frequency_table.tsv'
    OUTPUT_TIGRFAM = 'tigrfam_frequency_table.tsv'
    OUTPUT_CAZY = 'cazy_frequency_table.tsv'
    OUTPUT_CLUSTER = 'cluster_frequency_table.tsv'
    OUTPUT_ORTHOLOG = 'ortholog_frequency_table.tsv'
    OUTPUT_HYPOTHETICAL_ANNOTATIONS = 'hypothetical_annotations.tsv'
    OUTPUT_DIAMOND = "DIAMOND_search"
    GFF_SUFFIX = '.gff'
    PROTEINS_SUFFIX = '.faa'
    ANNOTATION_SUFFIX = '.tsv'
    PICKLE_SUFFIX = '.pickle'

    def __init__(self, output_directory, annotate_ko, annotate_ko_hmm,
                 annotate_pfam, annotate_tigrfam, annotate_cluster,
                 annotate_ortholog, annotate_cazy, annotate_ec, evalue, bit,
                 percent_id_cutoff, aln_query, aln_reference, fraction_aligned,
                 cut_ga, cut_nc, cut_tc, cut_hmm, inflation, chunk_number,
                 chunk_max, count_domains, threads, parallel, suffix, light):

        # Define inputs and outputs
        self.output_directory = output_directory

        # Define type of annotation to be carried out
        self.annotate_ko = annotate_ko
        self.annotate_ko_hmm = annotate_ko_hmm
        self.annotate_pfam = annotate_pfam
        self.annotate_tigrfam = annotate_tigrfam
        self.annotate_cluster = annotate_cluster
        self.annotate_ortholog = annotate_ortholog
        self.annotate_cazy = annotate_cazy
        self.annotate_ec = annotate_ec

        # Cutoffs
        self.evalue = evalue
        self.bit = bit
        self.percent_id_cutoff = percent_id_cutoff
        self.aln_query = aln_query
        self.aln_reference = aln_reference
        self.fraction_aligned = fraction_aligned
        self.cut_ga = cut_ga
        self.cut_nc = cut_nc
        self.cut_tc = cut_tc
        self.cut_hmm = cut_hmm
        self.inflation = inflation
        self.chunk_number = chunk_number
        self.chunk_max = chunk_max
        self.count_domains = count_domains

        # Parameters
        self.threads = threads
        self.parallel = parallel
        self.suffix = suffix
        self.light = light

        # Set up multiprocessing pool
        self.pool = mp.Pool(processes=int(self.parallel))

        # Load databases
        self.databases = Databases()

    def prep_genome(self, genome_file_list, genome_directory):
        '''
        Do any preparation specific to the genome annotation pipeline.

        Inputs
        ------
        genome_file_list - List. List of strings, each a path to a file
        containing a genome
        genome_directory - String. Path to a directory to copy the genomes into

        Outputs
        -------
        returns the directory with all genomes copied into it.
        '''
        # copy all the genomes into one directory
        logging.info('Preparing genomes for annotation')

        if genome_file_list:
            mkdir(genome_directory)
            genome_paths = list()

            for genome_path in genome_file_list:

                if genome_path.endswith(self.suffix):
                    genome_paths.append(genome_path)

            cmd = "xargs --arg-file=/dev/stdin cp --target-directory=%s" % genome_directory
            logging.debug(cmd)
            process = subprocess.Popen(["bash", "-c", cmd],
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True)
            process.communicate(input=str('\n'.join(genome_paths)))

        return genome_directory

    def call_proteins(self, genome_directory):
        '''
        Use prodigal to call proteins within the genomes

        Parameters
        ----------
        genome_directory  - string. Directory containing .fna files for each
                            input genome

        Outputs
        -------
        returns the directory containing a .faa file for each input genome
        '''
        protein_directory_path = path.join(self.output_directory,
                                           self.GENOME_PROTEINS)
        gene_directory_path = path.join(self.output_directory,
                                        self.GENOME_GENES)
        mkdir(protein_directory_path)
        mkdir(gene_directory_path)
        genome_list = list()
        genome_paths = list()

        for genome in listdir(genome_directory):

            if genome.endswith(self.suffix):
                genome_paths.append(path.splitext(genome)[0])

        logging.info("    - Calling proteins for %i genomes",
                     len(genome_paths))
        cmd = "ls %s/*%s | \
                    sed 's/%s//g' | \
                    grep -o '[^/]*$' | \
                    parallel -j %s \
                        prodigal \
                            -q \
                            -p meta \
                            -o /dev/null \
                            -d %s/{}%s \
                            -a %s/{}%s \
                            -i %s/{}%s \
                            > /dev/null 2>&1" \
                % (genome_directory, self.suffix, self.suffix, self.parallel, gene_directory_path,
                   self.suffix, protein_directory_path, self.PROTEINS_SUFFIX, genome_directory,
                   self.suffix)

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')

        # Sort both listings so the .faa and nucleotide files pair up by genome
        protein_directory_files = sorted(listdir(protein_directory_path))
        genome_directory_files = sorted(listdir(genome_directory))

        for genome_protein, genome_nucl in zip(protein_directory_files,
                                               genome_directory_files):
            genome_protein_base = genome_protein.replace(
                self.PROTEINS_SUFFIX, self.suffix)
            output_genome_protein_path = path.join(protein_directory_path,
                                                   genome_protein)
            output_genome_nucl_path = path.join(genome_directory, genome_nucl)
            output_genome_gene_path = path.join(gene_directory_path,
                                                genome_protein_base)

            genome = (self.light, output_genome_protein_path,
                      output_genome_nucl_path, output_genome_gene_path)
            genome_list.append(genome)

        return genome_list

    def annotate_diamond(self, genomes_list, database, parser_type, ids_type,
                         output_subdirectory):
        '''
        Annotate the proteins encoded by each genome with KO ids using either BLAST or HMM
        searches (not yet implemented).

        Parameters
        ----------
        genome_faa_directory  - string. Directory containing .faa files for
                                each input genome

        Outputs
        -------
        returns a directory containing the search results for each of the input population genomes,
        and a frequency matrix with the KOs as rows and the genomes as columns.
        '''

        output_directory_path = path.join(self.output_directory,
                                          output_subdirectory)
        genome_dict = {genome.name: genome for genome in genomes_list}
        mkdir(output_directory_path)
        specific_cutoffs = None

        with tempfile.NamedTemporaryFile() as temp:

            to_write = str()

            for genome in genomes_list:
                to_write += f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n"

            temp.write(str.encode(to_write))
            temp.flush()
            output_annotation_path = path.join(output_directory_path, self.OUTPUT_DIAMOND) + \
                                        self.ANNOTATION_SUFFIX
            logging.info('    - BLASTing genomes')
            self.diamond_search(temp.name, output_annotation_path, database)

            for genome_name, batch in self.get_batches(output_annotation_path):

                if batch:
                    genome = genome_dict[genome_name]
                    genome.add(batch, self.evalue, self.bit, self.aln_query,
                               self.aln_reference, specific_cutoffs,
                               parser_type, ids_type)

    def get_batches(self, input_file):
        '''
        Separate DIAMOND blast results into batches, where a batch is all the hits for a genome.

        Parameters
        ----------
        input_file - string. Path to a file of DIAMOND blast results.
        '''
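        # Usage sketch (hypothetical path), e.g. from annotate_diamond:
        #   for genome_name, batch in self.get_batches('DIAMOND_search.tsv'):
        #       if batch:
        #           ...  # batch holds every split hit line for genome_name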

        last = None
        input_file_io = open(input_file)

        for line in input_file_io:
            split_line = line.strip().split('\t')
            genome_id, _ = split_line[0].split('~')

            if last is None:
                last = genome_id
                batch = [split_line]

            else:

                if last == genome_id:
                    batch.append(split_line)
                else:
                    yield last, batch
                    batch = [split_line]
                    last = genome_id

        if last is None:
            yield None, None
        else:
            yield last, batch

    def diamond_search(self, tmp_name, output_path, database):
        '''
        Carry out a diamond blastp search.

        Parameters
        ----------
        tmp_name - string. Path to a bash script that streams the query sequences
        output_path - string. Path to file to output results into
        database - string. Path to the DIAMOND database to search against
        '''

        cmd = f'bash {tmp_name} | diamond blastp \
                                    --quiet \
                                    --outfmt 6 \
                                    --max-target-seqs 1 \
                                    --query /dev/stdin \
                                    --out {output_path} \
                                    --db {database} \
                                    --threads {self.threads} '

        if self.evalue:
            cmd += f'--evalue {self.evalue} '

        if self.bit:
            cmd += f'--min-score {self.bit} '

        if self.percent_id_cutoff:
            cmd += f'--id {self.percent_id_cutoff*100} '

        if self.aln_query:
            cmd += f"--query-cover {self.aln_query*100} "

        if self.aln_reference:
            cmd += f"--subject-cover {self.aln_reference*100} "

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')

    def hmmsearch_annotation(self, genomes_list, output_directory_path,
                             database, ids_type, parser):
        '''
        Annotate the proteins encoded by each genome with pfam ids using HMM searches.

        Parameters
        ----------
        genomes_list - list. list of Genome objects

        '''
        mkdir(output_directory_path)
        genome_dict = {genome.name: genome for genome in genomes_list}

        if ids_type in (AnnotationParser.TIGRFAM, AnnotationParser.PFAM):
            hmmcutoff = True
        else:
            hmmcutoff = False

        if ids_type == AnnotationParser.KO_HMM:
            specific_cutoffs = self.databases.parse_ko_cutoffs()
        else:
            specific_cutoffs = None

        self.hmm_search(output_directory_path, database, hmmcutoff)

        for genome_annotation in listdir(output_directory_path):
            genome_id = path.splitext(genome_annotation)[0]
            genome = genome_dict[genome_id]
            output_annotation_path = path.join(output_directory_path,
                                               genome_annotation)
            genome.add(output_annotation_path, self.evalue, self.bit,
                       self.aln_query, self.aln_reference, specific_cutoffs,
                       parser, ids_type)

    def annotate_hypothetical(self, genomes_list):
        '''
        Sort proteins coded by each genome into homologous clusters.

        Inputs
        ------
        genomes_list - list. list of Genome objects

        '''
        output_directory_path = path.join(self.output_directory,
                                          self.GENOME_HYPOTHETICAL)
        mkdir(output_directory_path)

        with tempfile.NamedTemporaryFile() as temp:

            to_write = str()

            for genome in genomes_list:
                to_write += f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n"

            temp.write(str.encode(to_write))
            temp.flush()

            tmp_dir = tempfile.mkdtemp()

            db_path = path.join(output_directory_path, "db")
            clu_path = path.join(output_directory_path, "clu")
            align_path = path.join(output_directory_path, "alignDb")
            blast_output_path = path.join(output_directory_path, "alignDb.m8")
            formatted_blast_output_path = path.join(output_directory_path,
                                                    "alignDb.formatted.m8")

            clu_tsv_path = path.join(output_directory_path,
                                     "hypothetical_clusters.tsv")

            logging.info('    - Generating MMSeqs2 database')
            cmd = "bash %s | sponge | mmseqs createdb /dev/stdin %s -v 0 > /dev/null 2>&1" % (
                temp.name, db_path)
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

            logging.info('    - Clustering genome proteins')
            cmd = f"mmseqs cluster \
                        {db_path} \
                        {clu_path} \
                        {tmp_dir} \
                        --max-seqs 1000 \
                        --threads {self.threads} \
                        --min-seq-id {self.percent_id_cutoff} \
                        -e {self.evalue} \
                        -c {self.fraction_aligned} \
                        -v 0 "

            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

            logging.info('    - Extracting clusters')
            cmd = 'mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1' % (
                db_path, db_path, clu_path, clu_tsv_path)
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

            logging.info(
                '    - Computing Smith-Waterman alignments for clustering results'
            )
            cmd = "mmseqs alignall %s %s %s --alignment-mode 3 -v 0  " % (
                db_path, clu_path, align_path)
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

            logging.info('    - Converting to BLAST-like output')
            cmd = "mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1   " % (
                db_path, db_path, align_path, blast_output_path)
            # --format-output query,target,bits
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

            logging.info('    - Reformatting BLAST output')
            cmd = "OFS=\"\t\" awk 'FNR==NR{a[$1]=$2;next}{$3=a[$3]; \
                                            $1=\"\"; for(i=2;i<NF;i++){printf(\"%s\t\",$i)} \
                                            printf(\"\\n\")}' %s %s | cut -f1,2,5 > %s" \
                % ("%s", db_path + '.lookup', blast_output_path, formatted_blast_output_path)
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
            logging.debug('Finished')

        ortholog_dict = self.run_mcl(formatted_blast_output_path,
                                     output_directory_path)
        ortholog_ids = ortholog_dict.keys()
        cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list,
                                                 ortholog_dict,
                                                 output_directory_path)
        return cluster_ids, ortholog_ids

    def run_mcl(self, blast_abc, output_directory_path):
        '''
        Parse the protein clusters produced by MMseqs2 using MCL.

        Parameters
        ----------
        blast_abc - string. an abc file for mcl to run on. More information on the format of abc
                    files can be found at https://micans.org/mcl/man/clmprotocols.html
        output_directory_path - string. Path to write the results of mcl parsing to.
        '''
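        # For reference, an .abc edge file is three whitespace-separated
        # columns (label, label, weight), one edge per line, e.g. toy values:
        #   genome_1~gene_a  genome_2~gene_b  1e-50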

        dict_path = path.join(output_directory_path, "alignDb.dict")
        mci_path = path.join(output_directory_path, "alignDb.mci")
        cluster_path = path.join(output_directory_path, "mcl_clusters.tsv")
        output_path = path.join(output_directory_path,
                                "mcl_clusters.convert.tsv")

        logging.info('    - Preparing network')
        ortholog_dict = dict()
        cmd = f"mcxload \
                    -abc {blast_abc} \
                    -write-tab {dict_path} \
                    -o {mci_path} \
                    --stream-mirror \
                    --stream-neg-log10 \
                    > /dev/null 2>&1"

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')

        logging.info('    - Finding orthologs')
        cmd = f'mcl \
                    {mci_path} \
                    -te {self.threads} \
                    -I {self.inflation} \
                    -o {cluster_path} \
                    > /dev/null 2>&1'

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')

        logging.info('    - Reformatting output')
        cmd = f'mcxdump \
                    -icl {cluster_path} \
                    -o {output_path} \
                    -tabr {dict_path} \
                    > /dev/null 2>&1'

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')

        ortholog = 1
        for line in open(output_path):
            ortholog_idx = "ortholog_%i" % ortholog
            ortholog_dict[ortholog_idx] = set()

            for protein in line.strip().split('\t'):
                ortholog_dict[ortholog_idx].add(protein)

            ortholog += 1

        return ortholog_dict

    def parse_cluster_results(self, cluster_output_path, genomes_list,
                              ortholog_dict, output_directory_path):
        '''
        Parse cluster output in tab format.

        Inputs
        ------
        cluster_output_path - String. Path to mmseqs2 clustering output file

        Returns
        -------
        A set of cluster names assigned across all genomes.

        '''
        logging.info('    - Parsing input cluster file: %s',
                     cluster_output_path)

        cluster_ids = set()
        previous_cluster_name = None
        counter = 0
        genome_dictionary = {genome.name: genome for genome in genomes_list}

        output_hypothetical_annotations = path.join(
            output_directory_path, self.OUTPUT_HYPOTHETICAL_ANNOTATIONS)
        with open(output_hypothetical_annotations, 'w') as out_io:

            for line in open(cluster_output_path):

                cluster_id, member = line.strip().split('\t')
                genome_id, sequence_id = member.split('~')

                if cluster_id == previous_cluster_name:
                    genome_dictionary[genome_id].add_cluster(
                        sequence_id, "cluster_%i" % counter)
                else:
                    counter += 1
                    previous_cluster_name = cluster_id
                    cluster_ids.add("cluster_%i" % counter)
                    genome_dictionary[genome_id].add_cluster(
                        sequence_id, "cluster_%i" % counter)
                out_io.write(
                    '\t'.join([genome_id, sequence_id,
                               "cluster_%i" % counter]) + '\n')

        for ortholog, group in ortholog_dict.items():

            for member in group:
                genome, protein = member.split('~')
                genome_dictionary[genome].add_ortholog(protein, ortholog)

        return cluster_ids

    def _default_hmmsearch_options(self):
        cmd = ''

        if self.bit:
            cmd += '-T %s ' % (str(self.bit))
        else:
            cmd += '-E %s ' % (str(self.evalue))

        return cmd

    def hmm_search(self, output_path, database, hmmcutoff):
        '''
        Carry out a hmmsearch.

        Parameters
        ----------
        output_path - string. Directory to write hmmsearch results into
        database    - string. Path to the HMM database to use for searching
        hmmcutoff   - bool. Whether to apply model-specific (GA/NC/TC) cutoffs
        '''

        input_genome_path = path.join(self.output_directory,
                                      self.GENOME_PROTEINS)
        cmd = "ls %s | sed 's/%s//g' | parallel -j %s\
                                                hmmsearch \
                                                    --cpu %s \
                                                    -o /dev/null \
                                                    --noali \
                                                    --domtblout %s/{}%s " \
                          % (input_genome_path, self.PROTEINS_SUFFIX, self.parallel,
                             self.threads, output_path, self.ANNOTATION_SUFFIX)
        if hmmcutoff:
            if (self.cut_ga or self.cut_nc or self.cut_tc):

                if self.cut_ga:
                    cmd += " --cut_ga "
                if self.cut_nc:
                    cmd += " --cut_nc "
                if self.cut_tc:
                    cmd += " --cut_tc "
            else:
                cmd += self._default_hmmsearch_options()
        else:
            cmd += self._default_hmmsearch_options()

        cmd += "%s %s/{}.faa 2> /dev/null" % (database, input_genome_path)

        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        logging.debug('Finished')
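
    # A hedged example of the assembled command, assuming parallel=2,
    # threads=4, PROTEINS_SUFFIX='.faa', ANNOTATION_SUFFIX='.tsv' and
    # hypothetical paths (all assumptions, for illustration only):
    #
    #   ls out/genome_proteins | sed 's/.faa//g' | parallel -j 2 \
    #       hmmsearch --cpu 4 -o /dev/null --noali \
    #       --domtblout out/pfam/{}.tsv --cut_ga \
    #       Pfam-A.hmm out/genome_proteins/{}.faa 2> /dev/null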

    def generate_gff_files(self, genomes_list):
        '''
        Write GFF files for each of the genome objects in genomes_list

        Parameters
        ----------
        genomes_list - List. List of Genome objects
        '''
        output_directory_path = path.join(self.output_directory,
                                          self.GENOME_GFF)
        mkdir(output_directory_path)
        for genome in genomes_list:
            logging.info('    - Generating .gff file for %s', genome.name)
            gff_output = path.join(output_directory_path,
                                   genome.name + self.GFF_SUFFIX)
            Writer.write_gff(genome, gff_output)

    def rename_fasta(self, genomes_list):
        '''
        Rename the called proteins with annotation ids.

        Parameters
        ----------
        genomes_list - List. List of Genome objects
        '''
        seqio = SequenceIO()

        for genome in genomes_list:
            fd, fname = tempfile.mkstemp(suffix='.faa', text=True)

            if genome.gene:
                fd_gene, fname_gene = tempfile.mkstemp(suffix='.fna',
                                                       text=True)

                with open(fname_gene, 'w') as out_gene_io:

                    for description, sequence in seqio.each(open(genome.gene)):
                        name = description.partition(' ')[0]
                        annotations = ' '.join(
                            genome.sequences[name].all_annotations())
                        out_gene_io.write(">%s %s\n" % (name, annotations))
                        out_gene_io.write(genome.sequences[name].gene + '\n')

                close(fd_gene)
                logging.debug('Moving %s to %s', fname_gene, genome.gene)
                shutil.move(fname_gene, genome.gene)

            with open(fname, 'w') as out_io:

                for description, sequence in seqio.each(open(genome.path)):
                    name = description.partition(' ')[0]
                    annotations = ' '.join(
                        genome.sequences[name].all_annotations())
                    out_io.write(">%s %s\n" % (name, annotations))
                    out_io.write(str(sequence) + '\n')

            close(fd)
            logging.debug('Moving %s to %s', fname, genome.path)
            shutil.move(fname, genome.path)

    def pickle_objects(self, genomes_list):
        '''
        Store annotated genome objects as pickles.

        Parameters
        ----------
        genomes_list - List. List of Genome objects
        '''
        output_directory_path = path.join(self.output_directory,
                                          self.GENOME_OBJ)
        mkdir(output_directory_path)
        for genome in genomes_list:
            genome_pickle_path = path.join(output_directory_path,
                                           genome.name + self.PICKLE_SUFFIX)
            with open(genome_pickle_path, 'wb') as output:
                pickle.dump(genome, output)
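
    # Loading an annotated genome back is the inverse operation; a minimal
    # sketch, assuming a hypothetical pickle path:
    #
    #   with open('genome_objects/genome_1.pickle', 'rb') as in_io:
    #       genome = pickle.load(in_io)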

    def list_splitter(self, input_list, chunk_number, chunk_max):
        """
        A generator that splits a list into approximately chunk_number
        smaller lists. The size of each sub-list is capped at chunk_max.
        """
        chunk_size = int(round(len(input_list) / float(chunk_number), 0))

        if chunk_size > chunk_max:
            chunk_size = chunk_max
        elif chunk_size < 1:
            # More chunks were requested than there are items; yield the
            # whole list as a single chunk.
            chunk_size = max(len(input_list), 1)

        for index in range(0, len(input_list), chunk_size):
            yield input_list[index:index + chunk_size]
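
    # A minimal usage sketch, with `ann` an instance of this class:
    #
    #   >>> list(ann.list_splitter(list(range(7)), 3, 2))
    #   [[0, 1], [2, 3], [4, 5], [6]]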

    def parse_genome_inputs(self, genome_directory, protein_directory,
                            genome_files, protein_files):
        '''
        Prepare the input genomes for annotation: use provided protein
        files where available, otherwise call proteins from the input
        genomes, then parse everything into Genome objects in parallel.

        Parameters
        ----------
        genome_directory    - String. Path to directory containing genomes.
        protein_directory   - String. Path to directory containing proteins
                              (.faa files) for genomes.
        genome_files        - List. List of strings, each a path to a .fna
                              genome file.
        protein_files       - List. List of strings, each a path to a .faa
                              proteins file.

        Returns
        -------
        List of Genome objects.
        '''

        prep_genomes_list = list()
        genomes_list = list()

        if protein_directory:
            logging.info("Using provided proteins")
            protein_genome_list = list()

            for protein_file in listdir(protein_directory):
                protein_genome_list.append(
                    path.join(protein_directory, protein_file))

            directory = self.prep_genome(
                protein_genome_list,
                path.join(self.output_directory, self.GENOME_PROTEINS))

            for genome_proteins_file in listdir(directory):

                if genome_proteins_file.endswith(self.suffix):
                    genome = (self.light,
                              path.join(directory,
                                        genome_proteins_file), None, None)
                    prep_genomes_list.append(genome)

        elif protein_files:
            logging.info("Using provided proteins")
            genome_proteins_path = path.join(self.output_directory,
                                             self.GENOME_PROTEINS)
            directory = self.prep_genome(protein_files, genome_proteins_path)

            for protein_file in listdir(directory):
                protein_file_path = path.join(directory,
                                              path.basename(protein_file))
                prep_genomes_list.append(
                    (self.light, protein_file_path, None, None))

        elif genome_directory:
            logging.info("Calling proteins for annotation")
            prep_genomes_list = self.call_proteins(genome_directory)
            directory = genome_directory

        elif genome_files:
            logging.info("Calling proteins for annotation")
            directory = self.prep_genome(
                genome_files, path.join(self.output_directory,
                                        self.GENOME_BIN))
            prep_genomes_list = self.call_proteins(directory)

        for chunk in self.list_splitter(prep_genomes_list, self.chunk_number,
                                        self.chunk_max):
            genomes_list += self.pool.map(parse_genomes, chunk)

        return genomes_list

    def annotate_pipeline(self, genome_directory, protein_directory,
                          genome_files, protein_files):
        '''
        Run Annotate pipeline for enrichM

        Parameters
        ----------
        genome_directory    - String. Path to directory containing genomes
        protein_directory   - String. Path to directory containing proteins (.faa files) for genomes
        genome_files        - List. List of strings, each to a .fna genome file.
        protein_files       - List. List of strings, each to a .faa proteins file.
        '''

        logging.info("Running pipeline: annotate")
        logging.info("Setting up for genome annotation")
        genomes_list = self.parse_genome_inputs(genome_directory,
                                                protein_directory,
                                                genome_files, protein_files)

        if genomes_list:
            logging.info("Starting annotation:")

            if (self.annotate_cluster or self.annotate_ortholog):
                logging.info(
                    '    - Annotating genomes with hypothetical clusters')
                cluster_ids, ortholog_ids = self.annotate_hypothetical(
                    genomes_list)

                logging.info('    - Generating hypotheticals frequency table')
                matrix_generator = MatrixGenerator(
                    MatrixGenerator.HYPOTHETICAL, cluster_ids)
                freq_table = path.join(self.output_directory,
                                       self.OUTPUT_CLUSTER)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

                if self.annotate_ortholog:
                    matrix_generator = MatrixGenerator(
                        MatrixGenerator.ORTHOLOG, ortholog_ids)
                    freq_table = path.join(self.output_directory,
                                           self.OUTPUT_ORTHOLOG)
                    matrix_generator.write_matrix(genomes_list,
                                                  self.count_domains,
                                                  freq_table)

            if self.annotate_ko:
                annotation_type = AnnotationParser.BLASTPARSER
                logging.info(
                    '    - Annotating genomes with ko ids using DIAMOND')
                self.annotate_diamond(genomes_list, self.databases.KO_DB,
                                      annotation_type, AnnotationParser.KO,
                                      self.GENOME_KO)

                logging.info('    - Generating ko frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.KO)
                freq_table = path.join(self.output_directory, self.OUTPUT_KO)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if self.annotate_ko_hmm:
                annotation_type = AnnotationParser.HMMPARSER
                logging.info('    - Annotating genomes with ko ids using HMMs')
                self.hmmsearch_annotation(
                    genomes_list,
                    path.join(self.output_directory,
                              self.GENOME_KO_HMM), self.databases.KO_HMM_DB,
                    AnnotationParser.KO, annotation_type)

                logging.info('    - Generating ko frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.KO)
                freq_table = path.join(self.output_directory,
                                       self.OUTPUT_KO_HMM)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if self.annotate_ec:
                annotation_type = AnnotationParser.BLASTPARSER
                logging.info('    - Annotating genomes with ec ids')
                self.annotate_diamond(genomes_list, self.databases.EC_DB,
                                      annotation_type, AnnotationParser.EC,
                                      self.GENOME_EC)

                logging.info('    - Generating ec frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.EC)
                freq_table = path.join(self.output_directory, self.OUTPUT_EC)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if self.annotate_pfam:
                annotation_type = AnnotationParser.HMMPARSER
                logging.info('    - Annotating genomes with pfam ids')
                self.hmmsearch_annotation(
                    genomes_list,
                    path.join(self.output_directory,
                              self.GENOME_PFAM), self.databases.PFAM_DB,
                    AnnotationParser.PFAM, annotation_type)

                logging.info('    - Generating pfam frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.PFAM)
                freq_table = path.join(self.output_directory, self.OUTPUT_PFAM)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if self.annotate_tigrfam:
                annotation_type = AnnotationParser.HMMPARSER
                logging.info('    - Annotating genomes with tigrfam ids')
                self.hmmsearch_annotation(
                    genomes_list,
                    path.join(self.output_directory,
                              self.GENOME_TIGRFAM), self.databases.TIGRFAM_DB,
                    AnnotationParser.TIGRFAM, annotation_type)

                logging.info('    - Generating tigrfam frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.TIGRFAM)
                freq_table = path.join(self.output_directory,
                                       self.OUTPUT_TIGRFAM)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if self.annotate_cazy:
                annotation_type = AnnotationParser.HMMPARSER
                logging.info('    - Annotating genomes with CAZY ids')
                self.hmmsearch_annotation(
                    genomes_list,
                    path.join(self.output_directory,
                              self.GENOME_CAZY), self.databases.CAZY_DB,
                    AnnotationParser.CAZY, annotation_type)

                logging.info('    - Generating CAZY frequency table')
                matrix_generator = MatrixGenerator(MatrixGenerator.CAZY)
                freq_table = path.join(self.output_directory, self.OUTPUT_CAZY)
                matrix_generator.write_matrix(genomes_list, self.count_domains,
                                              freq_table)

            if hasattr(list(genomes_list[0].sequences.values())[0], "prod_id"):
                logging.info('Generating .gff files:')
                self.generate_gff_files(genomes_list)

                logging.info('Renaming protein headers')
                self.rename_fasta(genomes_list)

            if not self.light:
                logging.info('Storing genome objects')
                self.pickle_objects(genomes_list)

            logging.info('Finished annotation')

        else:
            logging.error('No files found with %s suffix in input directory',
                          self.suffix)
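
# A hedged usage sketch for the pipeline above, assuming `annotator` is
# an instance of this annotation class, constructed as in the earlier
# example, and a hypothetical directory of .fna genome files:
#
#   annotator.annotate_pipeline(genome_directory='input_genomes',
#                               protein_directory=None,
#                               genome_files=None, protein_files=None)
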
class NetworkAnalyser:
    """
    Prepare metagenome, metatranscriptome, metabolomic data for constructing 
    SIF network files.
    """
    MATRIX          = 'matrix'
    NETWORK         = 'network'
    EXPLORE         = 'explore'
    DEGRADE         = 'degrade'
    PATHWAY         = 'pathway'
    ANNOTATE        = 'annotate'
    ENRICHMENT      = 'enrichment'
    MODULE_AB       = 'module_ab'
    TRAVERSE        = 'traverse'

    NETWORK_OUTPUT_FILE  = 'network.tsv'
    METADATA_OUTPUT_FILE = 'metadata.tsv'
    TRAVERSE_OUTPUT_FILE = 'traverse.tsv'

    def __init__(self):
        self.databases = Databases()
        self.reactions = self.databases.r()
        self.reaction_to_ko = self.databases.r2k()

    def average(self, input_dictionary):
        '''
        Replace each list of values in a nested dictionary
        (sample group -> genome group -> reaction -> list of values)
        with the mean of that list.
        '''
        
        for sample_group, group_dict in input_dictionary.items():

            for group, reaction_dict in group_dict.items():

                for reaction, value in reaction_dict.items():
                    input_dictionary[sample_group][group][reaction] = sum(value) / len(value)

        return input_dictionary
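
    # e.g. {'sgA': {'grpA': {'R00001': [2.0, 4.0]}}} becomes
    #      {'sgA': {'grpA': {'R00001': 3.0}}}.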

    def median_genome_abundance(self, sample_abundance_dict, sample_metadata):
        """
        Create a dictionary with the median abundance in sample_abundance_dict
        using sample_metadata as a reference.
        """
        
        median_sample_abundance = dict()

        for group, samples in sample_metadata.items():
            median_sample_abundance[group] = dict()
            sample_dictionaries = [sample_abundance_dict[sample] for sample in samples]
            genomes = set(list(itertools.chain(*[list(sample_dictionary.keys()) for sample_dictionary in sample_dictionaries])))

            for genome in genomes:
                abundances = [sample_dictionary[genome] for sample_dictionary in sample_dictionaries]
                median_sample_abundance[group][genome] = statistics.median(abundances)

        return median_sample_abundance
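
    # e.g. with sample_metadata = {'groupA': ['s1', 's2', 's3']} and
    # abundances 1.0, 5.0 and 2.0 for genome_1 in those samples, the
    # result is {'groupA': {'genome_1': 2.0}}.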

    def normalise_by_abundance(self, median_sample_abundances, reaction_abundance_dict, group_to_genome, genome_to_group, genome_groups):
        '''
        Weight each genome's reaction abundances by that genome's median
        abundance in each sample group, collecting the weighted values per
        sample group and genome group.
        '''
        normalised_abundance_dict = dict()
        for sample_group in list(median_sample_abundances.keys()):
            normalised_abundance_dict[sample_group] = dict()

            for genome_group in genome_groups:
                normalised_abundance_dict[sample_group][genome_group] = dict()

        for sample_group, genome_abundances in median_sample_abundances.items():

            for genome, genome_abundance in genome_abundances.items():

                if(genome in genome_to_group and
                   genome in reaction_abundance_dict):

                    for reaction in list(reaction_abundance_dict[genome].keys()):

                        normalised_value = reaction_abundance_dict[genome][reaction]*genome_abundance

                        genome_group = next(iter(genome_to_group[genome]))

                        if reaction in normalised_abundance_dict[sample_group][genome_group]:
                            normalised_abundance_dict[sample_group][genome_group][reaction].append( normalised_value )
                        else:
                            normalised_abundance_dict[sample_group][genome_group][reaction] = [normalised_value]

        return normalised_abundance_dict
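
    # e.g. a genome with median abundance 2.0 in sample group 'sgA' and a
    # reaction abundance of 3 contributes 2.0 * 3 = 6.0 to the list for
    # its genome group; average() later collapses these lists to means.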

    def average_tpm_by_sample(self, tpm_results, sample_metadata):
        '''
        Average TPM values for each genome and annotation across the
        samples of each group in sample_metadata, counting missing genomes
        or annotations as 0.0. Sample names are encoded to bytes to match
        the keys of the parsed TPM dictionary.
        '''
        output_dict = dict()
        tpm_dict, annotations, genomes = tpm_results

        for group, samples in sample_metadata.items():
            output_dict[group] = dict()

            for sample in samples:

                for annotation in annotations:

                    if str.encode(sample) in tpm_dict:

                        for genome in genomes:

                            if genome not in output_dict[group]:
                                output_dict[group][genome] = dict()

                            if annotation not in output_dict[group][genome]:
                                output_dict[group][genome][annotation] = list()

                            if genome in tpm_dict[str.encode(sample)]:

                                if annotation in tpm_dict[str.encode(sample)][genome]:
                                    output_dict[group][genome][annotation].append(tpm_dict[str.encode(sample)][genome][annotation])
                                else:
                                    output_dict[group][genome][annotation].append(0.0)

                            else:
                                output_dict[group][genome][annotation].append(0.0)

            for genome, values in output_dict[group].items():

                for annotation in values:
                    output_dict[group][genome][annotation] = sum(output_dict[group][genome][annotation])/len(output_dict[group][genome][annotation])

        return output_dict

    def average_tpm_values(self, transcriptome_abundance_dict, group_metadata):
        '''
        Average each reaction's TPM values across the members of every
        group in group_metadata, counting missing members or reactions
        as 0.0.
        '''
        output_dict = dict()
        reactions = list(self.reactions.keys())

        for genome_group_name, group_reaction_abundance_dict in transcriptome_abundance_dict.items():
            output_dict[genome_group_name] = dict()

            for group, members in group_metadata.items():
                output_dict[genome_group_name][group] = dict()

                for reaction in reactions:
                    to_average = list()

                    for member in members:

                        if member in group_reaction_abundance_dict:

                            if str.encode(reaction) in group_reaction_abundance_dict[member]:
                                to_average.append(group_reaction_abundance_dict[member][str.encode(reaction)])
                            else:
                                to_average.append(0.0)
                        else:
                            to_average.append(0.0)

                    average_value = sum(to_average) / len(to_average)
                    output_dict[genome_group_name][group][reaction] = average_value

        return output_dict

    def aggregate_dictionary(self, reference_dict, matrix_dict):
        '''
        Aggregate a KO abundance matrix into reaction abundances: for each
        sample, a reaction's abundance is the mean of the non-zero
        abundances of its associated KOs in reference_dict.
        '''
        output_dict_mean   = dict()

        for sample, ko_abundances in matrix_dict.items():
            output_dict_mean[sample]   = dict()

            for reaction, ko_list in reference_dict.items():
                abundances = list()

                for ko in ko_list:

                    if ko in ko_abundances:
                        if ko_abundances[ko]>0:
                            abundances.append(ko_abundances[ko])

                    else:
                        logging.debug("ID not found in input matrix: %s" % ko)

                if any(abundances):
                    abundance_mean = sum(abundances)/len(abundances) # average of the abundances...

                else:
                    abundance_mean = 0

                output_dict_mean[sample][reaction] = abundance_mean

        return output_dict_mean
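
    # A worked sketch: with reference_dict = {'R1': ['K1', 'K2', 'K3']}
    # and matrix_dict = {'sample1': {'K1': 2, 'K2': 0, 'K3': 4}}, K2 is
    # skipped as zero and sample1's 'R1' becomes (2 + 4) / 2 = 3.0.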

    def mock_metadata(self, genomes):
        genome_to_group = {genome:set([genome]) for genome in genomes}
        genome_groups = set(genomes)
        group_to_genome = dict(genome_to_group) # Make a copy here
        return genome_to_group, genome_groups, group_to_genome
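
    # e.g. mock_metadata(['g1', 'g2']) returns
    # ({'g1': {'g1'}, 'g2': {'g2'}}, {'g1', 'g2'},
    #  {'g1': {'g1'}, 'g2': {'g2'}}).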

    def network_pipeline(self,
           subparser_name,
           matrix, genome_metadata_path,
           transcriptome_abundances_path, transcriptome_metadata_path,
           metagenome_abundances, metagenome_metadata_path,
           metabolome,
           enrichment_output,
           depth, filter, limit, queries, output_directory):
        '''
        Parameters
        ----------
        subparser_name                  - String. Sub-command to run
                                          (explore or pathway).
        matrix                          - String. Path to an orthology (KO)
                                          frequency matrix.
        genome_metadata_path            - String. Path to a metadata file
                                          grouping the genomes.
        transcriptome_abundances_path   - String. Path to detectM TPM
                                          abundances.
        transcriptome_metadata_path    - String. Path to transcriptome
                                          sample metadata.
        metagenome_abundances           - String. Path to a genome
                                          abundance matrix.
        metagenome_metadata_path        - String. Path to metagenome sample
                                          metadata.
        metabolome                      - String. Path to a metabolome
                                          abundance matrix.
        enrichment_output               - String. Path to enrichment
                                          results to overlay on the network.
        depth                           - Int. Depth to traverse from the
                                          queried compounds (explore).
        filter                          - Reactions to exclude from the
                                          network (pathway).
        limit                           - Reactions to limit the network to
                                          (pathway).
        queries                         - List. Compounds to query
                                          (explore).
        output_directory                - String. Directory to write the
                                          network and metadata files into.
        '''
        orthology_matrix, genome_names, _ = Parser.parse_simple_matrix(matrix)
        if genome_metadata_path:
            genome_to_group, genome_groups, group_to_genome = \
                    Parser.parse_metadata_matrix(genome_metadata_path)
        else:
            genome_to_group, genome_groups, group_to_genome = \
                    self.mock_metadata(genome_names)

        reaction_matrix = self.aggregate_dictionary(self.reaction_to_ko, orthology_matrix)

        # Read in fisher results
        if enrichment_output:
            logging.info('Parsing input enrichment results')
            fisher_results = Parser.parse_enrichment_output(enrichment_output)
        else:
            logging.info('No enrichment results provided')
            fisher_results = None

        # Read in metabolome abundances
        if metabolome:
            logging.info('Parsing metabolome abundances')
            abundances_metabolome = Parser.parse_simple_matrix(metabolome)
        else:
            logging.info('No metabolome abundances provided')
            abundances_metabolome = None

        # Read in genome metagenome_abundances
        if metagenome_abundances:
            logging.info('Parsing input genome abundances')
            sample_abundance = Parser.parse_simple_matrix(metagenome_abundances)[0]
            sample_metadata = Parser.parse_metadata_matrix(metagenome_metadata_path)[2]
        else:
            # FIXME : There's always a better way than faking it.
            logging.info('No genome abundances provided')
            sample_abundance = {'MOCK': {x:1 for x in list(reaction_matrix.keys())} }
            sample_metadata = {"abundance": ['MOCK']}

        median_sample_abundances = self.median_genome_abundance(sample_abundance, sample_metadata)
        normalised_abundance_dict = self.normalise_by_abundance(median_sample_abundances, reaction_matrix, group_to_genome, genome_to_group, genome_groups)
        abundances_metagenome = self.average(normalised_abundance_dict)

        # Read in expression (TPM) values
        if transcriptome_abundances_path:
            logging.info("Parsing detectM TPM abundances")
            transcriptome_metadata = Parser.parse_metadata_matrix(transcriptome_metadata_path)[2]
            transcriptome_abundance_dict = self.average_tpm_by_sample(Parser.parse_tpm_values(transcriptome_abundances_path), transcriptome_metadata)
            transcriptome_abundances = self.average_tpm_values(transcriptome_abundance_dict, group_to_genome)
        else:
            transcriptome_abundances = None

        network_builder = NetworkBuilder(group_to_genome, abundances_metagenome,
                                         transcriptome_abundances, abundances_metabolome, 
                                         fisher_results)

        # Run the subcommand specified
        if subparser_name == self.EXPLORE:
            network_lines, node_metadata = network_builder.query_matrix(queries, depth)
        elif subparser_name == self.PATHWAY:
            network_lines, node_metadata = network_builder.pathway_matrix(limit, filter)
        else:
            # Guard against an unhandled sub-command, which would otherwise
            # surface as a NameError below.
            raise Exception('Unknown sub-command: %s' % subparser_name)

        # Write the outputs
        Writer.write(network_lines, os.path.join(output_directory, self.NETWORK_OUTPUT_FILE))
        Writer.write(node_metadata, os.path.join(output_directory, self.METADATA_OUTPUT_FILE))
        
        logging.info('Finished the %s pipeline' % subparser_name)
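
# A hedged usage sketch, assuming a KO frequency matrix produced by the
# annotate pipeline and hypothetical file paths; only the 'explore'
# sub-command's arguments are filled in, everything optional is None:
#
#   analyser = NetworkAnalyser()
#   analyser.network_pipeline(
#       NetworkAnalyser.EXPLORE,
#       'ko_frequency_table.tsv', 'genome_metadata.tsv',
#       None, None,            # transcriptome abundances / metadata
#       None, None,            # metagenome abundances / metadata
#       None,                  # metabolome
#       None,                  # enrichment output
#       depth=2, filter=None, limit=None,
#       queries=['C00084'], output_directory='network_output')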