def fetch_from_vertica_to_df(self, data_set, query):
        """Run ``query`` and store its result on ``data_set`` as a data frame.

        Nothing is executed when ``query`` equals the query already recorded
        for ``data_set``.  Otherwise the query is run, every fetched value is
        converted with ``str``, and the ``data``, ``query``, ``columns`` and
        ``built_features`` attributes of ``data_set`` are replaced.

        Args:
            data_set: Identifier handed to ``get_data_set_attribute`` and
                ``set_data_set_attribute``.
            query: SQL text to execute.

        Raises:
            ValueError: If the query returns no rows.
        """
        if self.get_data_set_attribute(data_set, 'query') == query:
            # Same query as last time: keep the data that is already stored.
            return

        connection = connect(self.connection_details)
        try:
            cursor = connection.cursor()
            try:
                # Single-argument print calls emit the same text whether
                # ``print`` is a statement or a function.
                print('Executing  %s Query...' % (data_set,))
                print(query)
                columns = get_column_names_from_sql_query(query)
                cursor.execute(query)

                data = []
                while True:
                    rows = cursor.fetchmany(10000)
                    # An empty batch is the DB-API signal that the result
                    # set is exhausted.
                    if not rows:
                        break
                    data.extend([str(ele) for ele in row] for row in rows)
            finally:
                # Release the cursor even when execute/fetch fails.
                cursor.close()
        finally:
            # The connection is local to this call, so close it here.
            connection.close()

        if not data:
            raise ValueError('SQL result in empty fetch!!')

        df = MetadataDataFrame(data=data,
                               columns=columns,
                               meta_info={
                                   'query': query,
                                   'built_features': [],
                                   'aggregate_values': {},
                                   'columns': columns
                               })

        self.set_data_set_attribute(data_set, 'data', df)
        self.set_data_set_attribute(data_set, 'query', query)
        self.set_data_set_attribute(data_set, 'columns', columns)
        self.set_data_set_attribute(data_set, 'built_features', [])
Example #2
0
 def load_results(self):
     """Populate ``self.result`` from disk or from ``self.meta_info``.

     If ``self.datafile`` exists it is unpickled into ``self.result``.
     Otherwise, when ``self.meta_info`` is available, an empty
     ``MetadataDataFrame`` is created whose columns are ``id``, ``model``,
     ``learner``, the de-duplicated ``features``, then ``metrics`` and
     ``learner_parameters`` taken from ``self.meta_info``.

     Raises:
         RuntimeError: If neither the data file nor ``meta_info`` exists.
     """
     if os.path.isfile(self.datafile):
         # NOTE: unpickling can execute arbitrary code; ``datafile`` must
         # come from a trusted location.
         with open(self.datafile, 'rb') as handle:
             self.result = Pickle.load(handle)
     elif self.meta_info is not None:
         columns = ['id', 'model', 'learner'] + list(
             set(self.meta_info['features'])) + self.meta_info[
                 'metrics'] + self.meta_info['learner_parameters']
         self.result = MetadataDataFrame(columns=columns,
                                         meta_info=self.meta_info)
     else:
         raise RuntimeError('No results exist to Summarize!!')
Example #3
0
    def add_metrics(self, model_name, learner_name, features, params_list,
                    metrics_list):
        """Append new metric rows to ``self.result`` and persist it.

        ``params_list`` and ``metrics_list`` are walked in lockstep.  A pair
        is skipped when ``check_if_exists`` reports it is already recorded;
        otherwise the dict returned by ``get_row_dict`` is added as a row.
        The updated result is then pickled to ``self.datafile``.

        Args:
            model_name: Passed through to ``check_if_exists``/``get_row_dict``.
            learner_name: Passed through to the same helpers.
            features: Passed through to the same helpers.
            params_list: Sequence of parameter sets, one per run.
            metrics_list: Sequence of metrics, aligned with ``params_list``.
        """
        rows = defaultdict(list)
        for params, metrics in zip(params_list, metrics_list):
            if self.check_if_exists(model_name, learner_name, features,
                                    params):
                continue
            row = self.get_row_dict(model_name, learner_name, features,
                                    params, metrics)
            # Collect values column-wise so the new frame is built in one go.
            for key, value in row.items():
                rows[key].append(value)

        self.result = self.result.append(MetadataDataFrame(data=rows),
                                         ignore_index=True)
        # Context manager guarantees the pickle is flushed and closed.
        with open(self.datafile, 'wb') as handle:
            Pickle.dump(self.result, handle)
Example #4
0
    def plot_results(self):
        """Draw a grid summarising the best-AUC rows and save it as a PNG.

        Steps, in order:
          1. Group ``self.result`` by model, learner and the feature columns
             and keep the row(s) with the highest AUC in each group.
          2. Per model, walk the rows in ascending AUC order and keep only
             those that beat the previously kept AUC by more than 0.0001.
          3. Return without plotting if only one row is left.
          4. Drop columns that have a single distinct value in
             ``self.result`` and order the remaining group-by columns by the
             count of their most common value.
          5. Draw one coloured cell per (row, column), a header cell per
             column, label the y axis with the AUC values and save the figure
             next to ``self.datafile`` with a ``.png`` suffix.
        """
        group_by_keys = ['model', 'learner'] + list(
            set(self.result.meta_info['features']))
        result = MetadataDataFrame(columns=group_by_keys)
        # Keep, for every group, the rows whose AUC equals the group maximum.
        for grouped_by, data_frame in self.result.groupby(group_by_keys):
            result = result.append(data_frame[group_by_keys + ['AUC']][
                data_frame['AUC'] == data_frame['AUC'].max()],
                                   ignore_index=True)

        # NOTE(review): ``DataFrame.sort`` is the legacy pandas spelling;
        # newer pandas releases only provide ``sort_values`` - confirm the
        # pinned pandas version before upgrading.
        result = result.sort(['AUC'], ascending=[1]).reset_index(drop=True)

        result_list = []
        for model in result['model'].unique():
            result_model = result[result['model'] == model]
            # Boolean mask over this model's rows; the first (lowest AUC)
            # row is always kept.
            rows_to_keep = [True]
            index = result_model.index.tolist()
            if len(result_model) > 1:
                base = result_model['AUC'][index[0]]
                for i in range(1, len(result_model)):
                    # Keep a row only if it improves on the last kept AUC by
                    # more than 0.0001.
                    if result_model['AUC'][index[i]] > base + 0.0001:
                        rows_to_keep.append(True)
                        base = result_model['AUC'][index[i]]
                    else:
                        rows_to_keep.append(False)
            result_list.append(result_model[rows_to_keep])

        # Stitch the per-model frames back into a single frame.
        result = result_list[0]
        if len(result_list) > 1:
            for i in range(1, len(result_list)):
                result = result.append(result_list[i], ignore_index=True)

        result = result.sort(['AUC'], ascending=[1]).reset_index(drop=True)

        # A single remaining row: return without drawing or saving anything.
        if len(result) == 1:
            return

        # key_freq[column] = size of the most common value of that column,
        # measured on the full ``self.result`` table (not the filtered one).
        key_freq = defaultdict(int)
        for key in result.keys():
            distinct_values = self.result[key].unique()
            if len(distinct_values) == 1:
                # Constant columns carry no information in the chart.
                del result[key]
            else:
                for val in distinct_values:
                    key_freq[key] = max(
                        key_freq[key],
                        len(self.result[key][self.result[key] == val]))

        # Column names ordered by ascending frequency of their top value.
        sorted_keys = [
            key[0]
            for key in sorted(key_freq.items(), key=operator.itemgetter(1))
        ]
        # Only the original group-by columns that survived the pruning above
        # are drawn; 'AUC' is shown through the y-axis labels instead.
        group_by_keys = [key for key in sorted_keys if key in group_by_keys]
        # Shared annotation kwargs; 'rotation' and 'fontsize' are overwritten
        # in place further down, so statement order matters.
        default_annotation = {'weight': 'bold', 'ha': 'center', 'va': 'center'}

        fig = plt.figure(figsize=(len(result.keys()) / 2, len(result) / 2))
        plt.subplots_adjust(left=0.1,
                            right=1.0,
                            bottom=0.0,
                            top=1.0,
                            wspace=0.2,
                            hspace=0.0)

        # ``index`` is the running x offset of the column being drawn.
        index = 0
        for key in group_by_keys:
            # Despite the name, this is the column in current row order
            # (rows are sorted by ascending AUC), not sorted by value.
            sorted_values = result[key].tolist()
            # NOTE(review): COLOR_MAP yields a pair of callables that are
            # invoked with a 0..1 fraction - presumably a colormap and its
            # reverse; confirm against its definition.
            color_map, color_map_r = COLOR_MAP.next()
            # 'model' and 'learner' columns are three units wide, others one.
            width = 3 if key in ['model', 'learner'] else 1
            for i in range(len(result)):
                if result[key][i] == 'present':
                    plt.gca().add_patch(
                        Rectangle(xy=(index, i),
                                  facecolor='green',
                                  width=width,
                                  height=1,
                                  alpha=0.5))

                elif result[key][i] == 'absent':
                    plt.gca().add_patch(
                        Rectangle(xy=(index, i),
                                  facecolor='red',
                                  width=width,
                                  height=1,
                                  alpha=0.5))
                else:
                    # Colour by the position of the value's first occurrence
                    # in the column, scaled to the 0..1 range.
                    color_value = color_map(
                        sorted_values.index(result[key][i]) /
                        float(len(result) - 1))
                    color_value_r = color_map_r(
                        sorted_values.index(result[key][i]) /
                        float(len(result) - 1))
                    default_annotation.update({
                        'rotation': '00',
                        'fontsize': 8
                    })
                    plt.gca().add_patch(
                        Rectangle(xy=(index, i),
                                  facecolor=color_value,
                                  width=width,
                                  height=1))
                    # NOTE(review): the second '+' below is a unary plus and
                    # has no effect on the computed x position.
                    plt.gca().annotate(sorted_values[i],
                                       (index + +float(width) / 2, i + 0.5),
                                       color=color_value_r,
                                       **default_annotation)

            # Header cell above the rows, labelled with the column name.
            default_annotation.update({'rotation': '90', 'fontsize': 9})
            plt.gca().add_patch(
                Rectangle(xy=(index, len(result)),
                          facecolor=color_map(1),
                          width=width,
                          height=15))
            plt.gca().annotate(key,
                               (index + float(width) / 2, len(result) + 7.5),
                               color='black',
                               **default_annotation)
            index += width

        # Leave 15 units of vertical room for the header cells.
        plt.ylim([0, len(result) + 15])
        plt.xlim([0, index])
        # One tick at the vertical centre of every row.
        # NOTE(review): numpy documents ``endpoint`` as a bool; the float
        # passed here is always non-zero and therefore acts like True -
        # presumably ``endpoint=True`` was intended.
        ticks = np.linspace(start=0.5,
                            stop=len(result) - 0.5,
                            endpoint=len(result) - 0.5,
                            num=len(result))
        labels = [round(value, 4) for value in result['AUC'].tolist()]
        plt.gca().get_yaxis().set_ticks(ticks)
        plt.ylabel('AUC')
        plt.gca().get_yaxis().set_ticklabels(labels)

        # Drop everything after the last '.' in ``self.datafile`` and append
        # '.png'.
        # NOTE(review): a path without an extension, or with dots only in a
        # directory part, is truncated at that dot - verify callers always
        # pass a name with an extension.
        png_path = '.'.join(self.datafile.split('.')[0:-1]) + '.png'
        fig.savefig(png_path, dpi=fig.dpi)
        print 'Summary saved as', png_path