Beispiel #1
0
    def _organize_matrix(self):
        """Back-fill the per-tax-ID count tables for every matrix name.

        The four tables come from the ``_get_*`` helpers and are assumed to be
        mappings of ``tax_id -> {matrix name: value}`` (TODO confirm against
        the helpers). For each tax ID present in any table and each name from
        ``get_matrix_names()``:

        * a name missing from all four tables gets ``matrix_sum`` (from
          ``cm.Confusion.matrix_sum()``) as its true-negative value;
        * a name missing from the TP, FN or FP table gets 0 there.

        The true-negative table is not otherwise back-filled.

        Returns:
            tuple: ``(TP, FN, FP, TN, p, r)``; ``p`` and ``r`` are passed
            through unchanged from ``_get_name_and_rank()``.
        """

        def row_for(table, tax_id):
            """Return ``table[tax_id]``, creating an empty row when absent."""
            try:
                return table[tax_id]
            except KeyError:
                # The tax ID was only seen in one of the other tables.
                table[tax_id] = {}
                return table[tax_id]

        Juice = cm.Confusion(os.path.join(self.input_path, self.cm_truth), "")
        tp, fn, fp, tn, p, r = self._get_name_and_rank()

        names = self.get_matrix_names()
        TP = self._get_true_positives(tp)
        FN = self._get_false_negatives(fn)
        FP = self._get_false_positives(fp)
        TN = self._get_true_negatives(tn)

        all_tax_ids = set(TP.keys()) | set(FN.keys()) | set(FP.keys()) | set(
            TN.keys())
        matrix_sum = Juice.matrix_sum()

        for tax_id in all_tax_ids:
            for name in names:
                tp_row = row_for(TP, tax_id)
                fn_row = row_for(FN, tax_id)
                fp_row = row_for(FP, tax_id)

                # The TN row is only touched when the name is unknown to the
                # other three tables, so it is looked up lazily.
                if (name not in tp_row and name not in fn_row
                        and name not in fp_row):
                    tn_row = row_for(TN, tax_id)
                    if name not in tn_row:
                        tn_row[name] = matrix_sum

                for row in (tp_row, fn_row, fp_row):
                    if name not in row:
                        row[name] = 0
        return TP, FN, FP, TN, p, r
Beispiel #2
0
 def save_matrices_as_csv(self, file_path):
     """Write the table of every not-yet-saved matrix to a CSV file.

     For each name in ``self.matrix_dict`` whose ``self.saved`` flag is
     falsy, the table is (re)built through ``create_table`` and handed to
     ``cm.Confusion.save_matrix_table`` with the target
     ``<file_path>/<cm_truth> <name>``.

     Args:
         file_path: Directory the output files are placed in.

     NOTE(review): ``self.saved`` is not updated here -- confirm whether
     the flag is maintained elsewhere.
     """
     Juice = cm.Confusion(os.path.join(self.input_path, self.cm_truth), "")
     for name in self.matrix_dict:
         if self.saved[name]:
             continue
         Juice.set_file_name(os.path.join(self.input_path, name))
         self.create_table(name)
         csv_name = os.path.join(file_path, self.cm_truth + " " + name)
         Juice.save_matrix_table(self.matrix_tables[name], csv_name)
Beispiel #3
0
 def create_table(self, name=""):
     """Build the matrix table for one matrix, or for all missing ones.

     Args:
         name: Key into ``self.matrix_dict``. With the default empty
             string, a table is built for every matrix that has no entry
             in ``self.matrix_tables`` yet. With a known name, that
             table is always rebuilt. With an unknown name, a message is
             printed and nothing is built.
     """
     confusion = cm.Confusion(
         os.path.join(self.input_path, self.cm_truth), "")

     def build(key):
         # Point the helper at the matrix's file, then run the
         # add-info -> reformat -> table pipeline on the stored matrix.
         confusion.set_file_name(os.path.join(self.input_path, key))
         self.matrix_tables[key] = confusion.create_matrix_table(
             confusion.reformat_matrix(
                 confusion.add_other_info(self.matrix_dict[key])))

     if name == "":
         for key in self.matrix_dict:
             if key not in self.matrix_tables:
                 build(key)
         return

     if name in self.matrix_dict:
         build(name)
         return

     print("There is no matrix by the name \'{}\'".format(name))
Beispiel #4
0
    def main(self,
             gnd_truth,
             excel_name="TaxaPerformanceMetrics_byTool",
             gen_dir="",
             file_path="",
             csv=0,
             dendros=0):
        """Run the pipeline: load profiles, export results, draw dendrograms.

        Args:
            gnd_truth: File name of the ground-truth profile, resolved
                relative to ``gen_dir``.
            excel_name: Base name (without ``.xlsx``) of the output workbook.
            gen_dir: Directory searched for ``*.profile`` files; stored as
                ``self.input_path``.
            file_path: Output directory; stored as ``self.output_path``.
            csv: When equal to 1, ``save_matrices_as_csv`` is also called.
            dendros: When equal to 1, a dendrogram is created for every rank
                returned by ``read_excel`` (plus the empty rank ``""``) on
                each of the metric sheets listed below.
        """
        profile_paths = glob(os.path.join(gen_dir, "*.profile"))
        self.input_path = gen_dir
        self.output_path = file_path
        self.output_name = excel_name

        confusion = cm.Confusion(os.path.join(self.input_path, gnd_truth), "")
        self.set_truth(gnd_truth)

        # Register a matrix for every profile that is not known yet.
        for profile in profile_paths:
            label = os.path.basename(profile)
            if label in self.matrix_dict:
                continue
            confusion.set_file_name(profile)
            self.add_matrix(label, confusion.main("no"))

        if csv == 1:
            self.save_matrices_as_csv(self.output_path)

        self.save_as_excel(self.output_path, excel_name)

        if dendros != 1:
            return

        # Dendrograms
        workbook = os.path.join(self.output_path, excel_name + ".xlsx")
        metric_sheets = (
            "True Positives",
            "False Negatives",
            "False Positives",
            "True Negatives",
            "Precall",
        )
        for sheet in metric_sheets:
            ranks = self.read_excel(sheet, workbook)
            # The empty rank is processed in addition to the ranks read back.
            ranks.append("")
            for rank in ranks:
                self.create_dendrogram(sheet, rank, workbook)
        print("\nThe Dendrograms have been saved in {}.".format(
            self.output_path))
Beispiel #5
0
    def get_top_taxid(self, x, metric='tp', difficulty='least'):
        """Save the first ``x`` ground-truth tax IDs ranked by one metric.

        Reads ``<output_path>/<output_name>.xlsx``, builds a table of tax
        IDs, names and the chosen aggregate column, keeps only the tax IDs
        found in the ground-truth profile, sorts by the aggregate and writes
        the first ``x`` rows to ``Top_<Difficulty>-<METRIC>_taxid.xlsx`` in
        ``self.output_path``.

        Args:
            x: Number of rows to keep (stop value of an ``iloc`` slice).
            metric: 'tp', 'fn', 'fp', 'tn' or 'precall' (case-insensitive).
                'precall' ranks by the mean of the 'Aggregate' columns of the
                Precision and Recall sheets.
            difficulty: 'least' sorts ascending, 'most' sorts descending
                (case-insensitive).

        Returns:
            str: Path of the workbook that was written.

        Raises:
            ValueError: If ``metric`` or ``difficulty`` is not recognised.
        """
        # metric key -> (column name in the result, sheet holding 'Aggregate')
        aggregate_sheets = {
            'tp': ('TP-Agg', 'True Positives'),
            'fn': ('FN-Agg', 'False Negatives'),
            'fp': ('FP-Agg', 'False Positives'),
            'tn': ('TN-Agg', 'True Negatives'),
        }
        sort_ascending = {'most': False, 'least': True}

        # Validate before any file is read so bad arguments fail clearly
        # instead of surfacing later as an unbound local variable.
        metric_key = metric.lower()
        if metric_key != 'precall' and metric_key not in aggregate_sheets:
            raise ValueError(
                "metric must be one of 'tp', 'fn', 'fp', 'tn' or 'precall', "
                "got {!r}".format(metric))
        difficulty_key = difficulty.lower()
        if difficulty_key not in sort_ascending:
            raise ValueError(
                "difficulty must be 'most' or 'least', got {!r}".format(
                    difficulty))

        excel_name = os.path.join(self.output_path, self.output_name) + '.xlsx'
        sheet_cache = {}

        def read_sheet(sheet):
            # Each sheet is parsed at most once per call.
            if sheet not in sheet_cache:
                sheet_cache[sheet] = pd.read_excel(excel_name,
                                                   sheet_name=sheet,
                                                   engine='openpyxl')
            return sheet_cache[sheet]

        metric_df = pd.DataFrame()
        metric_df['Tax ID'] = read_sheet('Precision')['Tax ID']
        names = read_sheet('True Positives')['name']
        # Keep only the last '|'-separated component of each name.
        metric_df['Names'] = [re.split(r'\|', name).pop() for name in names]

        if metric_key == 'precall':
            metric_df['Pre-Agg'] = read_sheet('Precision')['Aggregate']
            metric_df['Re-Agg'] = read_sheet('Recall')['Aggregate']
            metric_df['Average'] = (metric_df['Pre-Agg'] +
                                    metric_df['Re-Agg']) / 2
            base = 'Average'
        else:
            base, sheet = aggregate_sheets[metric_key]
            metric_df[base] = read_sheet(sheet)['Aggregate']

        # Filtering out taxids not in ground truth
        Juice = cm.Confusion(self.cm_truth, '')
        Tea = cm.comp.Comparator()
        Chai = cm.comp.pp.Parser()
        true_taxids = Juice.dictionary_to_set(
            Tea.save_tax_ID(
                Chai.main(os.path.join(self.input_path, self.cm_truth))))
        # A membership filter drops every row whose tax ID is not in the
        # ground truth, and ignores ground-truth IDs absent from the sheet.
        metric_df = metric_df[metric_df['Tax ID'].isin(set(true_taxids))]

        needed_df = metric_df.sort_values(
            by=base,
            ascending=sort_ascending[difficulty_key],
            na_position='last').iloc[0:x, :]
        fn = 'Top_' + difficulty.capitalize() + '-' + metric.upper(
        ) + '_taxid.xlsx'
        out_path = os.path.join(self.output_path, fn)
        needed_df.to_excel(out_path, index=False)
        print('\nSaved as {}'.format(out_path))
        return out_path
Beispiel #6
0
    def trace_back(self, metric):
        """Recompute one metric table from the raw profile files.

        The ground-truth profile and every profile named by
        ``get_matrix_names()`` are parsed with ``Parser.main(path, 1)``.
        For each column of the parsed data the '|'-separated IDs found in the
        last element of every iterable cell are pooled into a set. Each
        prediction is then compared with the truth through
        ``Comparator.combine_tax_ID`` and ``Confusion.confusion_matrix``.

        Args:
            metric: 'True Positives', 'False Negatives', 'False Positives',
                'True Negatives', 'Precision' or 'Recall'.

        Returns:
            pandas.DataFrame: One column per prediction name, one row per key
            of the confusion-matrix result. Count metrics have missing
            values filled with 0; Precision and Recall are NaN where both
            contributing counts are 0. An unrecognised ``metric`` yields an
            empty frame.
        """

        def collect_taxids(frame):
            """Pool the IDs of every iterable cell into one set per column."""
            pooled = {}
            for col in frame.columns:
                for ind in frame.index:
                    bucket = pooled.setdefault(col, set())
                    cell = frame.loc[ind, col]
                    if isinstance(cell, Iterable):
                        bucket.update(tax
                                      for tax in re.split(r'\|', cell[-1])
                                      if tax != '')
            return pooled

        Chai = cm.comp.pp.Parser()
        true_samples = pd.DataFrame.from_dict(
            Chai.main(os.path.join(self.input_path, self.cm_truth), 1))
        preds = self.get_matrix_names()

        # to get true data
        true_data = collect_taxids(true_samples)

        # to get predicted data
        data = {}
        for name in preds:
            matrix = pd.DataFrame.from_dict(
                Chai.main(os.path.join(self.input_path, name), 1))
            data[name] = collect_taxids(matrix)

        # turn predicted data into confusion matrices
        Tea = cm.comp.Comparator()
        Juice = cm.Confusion('', '')
        new_matrix = {}
        for name in data:
            combined_taxid = Tea.combine_tax_ID(true_data, data[name])
            new_matrix[name] = Juice.confusion_matrix(true_data, data[name],
                                                      combined_taxid)
        matrix_df = pd.DataFrame.from_dict(new_matrix, orient='index')

        def extract(pick):
            """Apply ``pick`` to every iterable cell of ``matrix_df``."""
            table = {}
            # Columns outermost, matching the original fill order.
            for col_label in matrix_df.columns:
                for row_label in matrix_df.index:
                    cells = table.setdefault(row_label, {})
                    if col_label in cells:
                        continue
                    cell = matrix_df.loc[row_label, col_label]
                    if isinstance(cell, Iterable):
                        cells[col_label] = pick(cell)
            return pd.DataFrame.from_dict(table)

        def ratio(hits, misses):
            # Undefined (NaN) when there is nothing to divide by.
            if (hits == 0) and (misses == 0):
                return np.nan
            return (hits) / (hits + misses)

        # Position of each count inside a confusion-matrix cell.
        count_slots = {
            'True Positives': 0,
            'False Negatives': 1,
            'False Positives': 2,
            'True Negatives': 3,
        }

        # make a data frame for the correct metric
        if metric in count_slots:
            slot = count_slots[metric]
            return extract(lambda cell: cell[slot]).fillna(0)
        if metric == 'Precision':  # TP / (TP + FP)
            return extract(lambda cell: ratio(cell[0], cell[2]))
        if metric == 'Recall':  # TP / (TP + FN)
            return extract(lambda cell: ratio(cell[0], cell[1]))
        return pd.DataFrame()