Ejemplo n.º 1
0
 def setup(self):
     """Reset the collected data and pick up ``viewer_effective_digits``.

     The value stored under ``viewer_effective_digits`` in the cravat
     configuration is used when that key is present; otherwise the value
     from ``constants`` is used.
     """
     self.data = {}
     self.keep_json_all_mapping = True
     self.conf = ConfigLoader()
     cravat_conf = self.conf.get_cravat_conf()
     digits_key = 'viewer_effective_digits'
     self.viewer_effective_digits = (
         cravat_conf[digits_key]
         if digits_key in cravat_conf
         else constants.viewer_effective_digits)
Ejemplo n.º 2
0
 def __init__(self, *inargs, **inkwargs):
     """Initialise the module from positional/keyword command arguments.

     Path, logger and database attributes are reset to ``None`` and the
     arguments are parsed.  Unless the parsed arguments carry a truthy
     ``live`` attribute, the module name and directories are derived from
     the file that defines this class, ``_setup_logger`` is called and the
     module configuration is loaded and verified.  Any exception raised
     along the way is handed to ``self._log_exception``.
     """
     try:
         main_fpath = os.path.abspath(sys.modules[self.__module__].__file__)
         self.primary_input_path = self.secondary_paths = None
         self.output_dir = self.output_basename = None
         self.plain_output = self.job_conf_path = None
         self.logger = self.dbconn = self.cursor = None
         self._define_cmd_parser()
         self.args = cravat.util.get_args(
             self.cmd_arg_parser, inargs, inkwargs)
         self.parse_cmd_args(inargs, inkwargs)
         # Both attributes are optional on the parsed arguments.
         self.status_writer = getattr(self.args, "status_writer", None)
         live = getattr(self.args, "live", False)
         self.supported_chroms = set(cannonical_chroms)
         if live:
             # Live mode stops before the file/config based setup below.
             return
         main_basename = os.path.basename(main_fpath)
         # Drop only the final extension; a name without a dot is kept whole.
         self.module_name = main_basename.rsplit(".", 1)[0]
         self.annotator_name = self.module_name
         module_dir = os.path.dirname(main_fpath)
         self.module_dir = module_dir
         self.annotator_dir = module_dir
         self.data_dir = os.path.join(self.module_dir, "data")
         self._setup_logger()
         self.conf = ConfigLoader(self.job_conf_path).get_module_conf(
             self.module_name)
         self._verify_conf()
         self._id_col_name = self.conf["output_columns"][0]["name"]
         if "logging_level" in self.conf:
             self.logger.setLevel(self.conf["logging_level"].upper())
         # Without a configured title, fall back to the upper-cased name of
         # the module directory.
         self.annotator_display_name = (
             self.conf["title"]
             if "title" in self.conf
             else os.path.basename(self.module_dir).upper())
         self.annotator_version = (
             self.conf["version"] if "version" in self.conf else "")
     except Exception as e:
         self._log_exception(e)
Ejemplo n.º 3
0
 def __init__(self, *inargs, **inkwargs):
     """Initialise the module from positional/keyword command arguments.

     All path, logger and database attributes start as ``None``.  After
     the arguments are parsed, a truthy ``live`` attribute on them ends
     initialisation early; otherwise names and directories are derived
     from the file defining this class, ``_setup_logger`` runs and the
     module configuration is loaded and verified.  Exceptions are passed
     to ``self._log_exception``.
     """
     try:
         main_fpath = os.path.abspath(sys.modules[self.__module__].__file__)
         for attr in ('primary_input_path', 'secondary_paths', 'output_dir',
                      'output_basename', 'plain_output', 'job_conf_path',
                      'logger', 'dbconn', 'cursor'):
             setattr(self, attr, None)
         self._define_cmd_parser()
         self.args = cravat.util.get_args(
             self.cmd_arg_parser, inargs, inkwargs)
         self.parse_cmd_args(inargs, inkwargs)
         self.status_writer = getattr(self.args, 'status_writer', None)
         if getattr(self.args, 'live', False):
             return
         module_dir, main_basename = os.path.split(main_fpath)
         # Only the last extension is removed from the file name.
         self.module_name = main_basename.rsplit('.', 1)[0]
         self.annotator_name = self.module_name
         self.module_dir = module_dir
         self.annotator_dir = module_dir
         self.data_dir = os.path.join(self.module_dir, 'data')
         self._setup_logger()
         config_loader = ConfigLoader(self.job_conf_path)
         self.conf = config_loader.get_module_conf(self.module_name)
         self._verify_conf()
         self._id_col_name = self.conf['output_columns'][0]['name']
         if 'logging_level' in self.conf:
             self.logger.setLevel(self.conf['logging_level'].upper())
         if 'title' in self.conf:
             display_name = self.conf['title']
         else:
             # No title configured: use the upper-cased directory name.
             display_name = os.path.basename(self.module_dir).upper()
         self.annotator_display_name = display_name
         self.annotator_version = (
             self.conf['version'] if 'version' in self.conf else '')
     except Exception as e:
         self._log_exception(e)
Ejemplo n.º 4
0
 def parse_cmd_args(self, parser, cmd_args):
     """Parse ``cmd_args`` with ``parser`` and copy the results onto ``self``.

     Besides mirroring the parsed options as attributes, this derives
     ``output_dir`` (defaulting to the directory of ``dbpath``), resolves a
     bare ``savepath`` file name against ``output_dir``, builds the
     ``ConfigLoader`` and module configuration, and computes the status
     file path.  When no input files were given, they are read from the
     ``_input_paths`` row of the ``info`` table in the database at
     ``dbpath``.

     Args:
         parser: Argument parser providing ``parse_args``.
         cmd_args: Raw arguments; passed through ``clean_args`` first.
     """
     cmd_args = clean_args(cmd_args)
     parsed_args = parser.parse_args(cmd_args)
     self.parsed_args = parsed_args
     self.dbpath = parsed_args.dbpath
     self.filterpath = parsed_args.filterpath
     self.filtername = parsed_args.filtername
     self.filterstring = parsed_args.filterstring
     self.confs = None
     if parsed_args.confs is not None:
         # Strip wrapping single quotes and turn inner single quotes into
         # double quotes so the text parses as JSON.
         confs = parsed_args.confs.strip("'").replace("'", '"')
         self.confs = json.loads(confs)
         # NOTE(review): self.filter is only assigned when confs is given.
         if 'filter' in self.confs:
             self.filter = self.confs['filter']
         else:
             self.filter = None
     if parsed_args.output_dir is not None:
         self.output_dir = parsed_args.output_dir
     else:
         self.output_dir = os.path.dirname(self.dbpath)
     self.savepath = parsed_args.savepath
     if self.savepath is not None and os.path.dirname(self.savepath) == '':
         # A bare file name is placed inside the output directory.
         self.savepath = os.path.join(self.output_dir, self.savepath)
     self.confpath = parsed_args.confpath
     self.conf = ConfigLoader(job_conf_path=self.confpath)
     self.module_name = parsed_args.module_name
     if self.conf is not None:
         self.module_conf = self.conf.get_module_conf(self.module_name)
     else:
         self.module_conf = None
     if hasattr(parsed_args, 'reporttypes'):
         self.report_types = parsed_args.reporttypes
     # Drops the last seven characters of the file name -- presumably a
     # ".sqlite" extension; TODO confirm against callers.
     self.output_basename = os.path.basename(self.dbpath)[:-7]
     status_fname = '{}.status.json'.format(self.output_basename)
     self.status_fpath = os.path.join(self.output_dir, status_fname)
     self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
     if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
         db = sqlite3.connect(parsed_args.dbpath)
         try:
             c = db.cursor()
             # Bound parameter instead of a double-quoted SQL string
             # literal, which SQLite only accepts as a compatibility quirk.
             c.execute('select colval from info where colkey=?',
                       ('_input_paths',))
             r = c.fetchone()
         finally:
             # The connection is needed for this single lookup only.
             db.close()
         if r is not None:
             # The stored value uses single quotes; convert it to JSON.
             input_paths = json.loads(r[0].replace("'", '"'))
             parsed_args.inputfiles = [
                 input_paths[key] for key in input_paths
             ]
     self.args = parsed_args
Ejemplo n.º 5
0
 def __init__(self, cmd_args):
     """Configure the instance from ``cmd_args``.

     ``cmd_args[0]`` is passed to ``get_caller_name`` to obtain the module
     name, the arguments are parsed, the module configuration is loaded,
     and the result of ``self.check()`` is stored in
     ``should_run_annotate``.
     """
     self.module_name = get_caller_name(cmd_args[0])
     self.parse_cmd_args(cmd_args)
     self._setup_logger()
     self.conf = ConfigLoader().get_module_conf(self.module_name)
     self.fix_col_names()
     # Handles start empty and are reset right before
     # _open_db_connection() runs.
     self.dbconn = self.cursor = self.cursor_w = None
     self._open_db_connection()
     self.should_run_annotate = self.check()
Ejemplo n.º 6
0
 def parse_cmd_args(self, cmd_args):
     """Parse ``cmd_args[1:]`` and store the results as attributes.

     A fresh ``argparse`` parser is built with one positional ``dbpath``
     and the optional filter, save path, conf path, report type and module
     name options.  The ``ConfigLoader``, the module configuration and the
     output/status paths derived from ``dbpath`` are stored as well.

     Args:
         cmd_args: Argument list; the first item is skipped when parsing.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('dbpath', help='Path to aggregator output')
     optional_specs = (
         ('-f', {'dest': 'filterpath', 'default': None,
                 'help': 'Path to filter file'}),
         ('-F', {'dest': 'filtername', 'default': None,
                 'help': 'Name of filter (stored in aggregator output)'}),
         ('--filterstring', {'dest': 'filterstring', 'default': None,
                             'help': 'Filter in JSON'}),
         ('-s', {'dest': 'savepath', 'default': None,
                 'help': 'Path to save file'}),
         ('-c', {'dest': 'confpath', 'help': 'path to a conf file'}),
         ('-t', {'dest': 'reporttypes', 'nargs': '+', 'default': None,
                 'help': 'report types'}),
         ('--module-name', {'dest': 'module_name', 'default': None,
                            'help': 'report module name'}),
     )
     for flag, options in optional_specs:
         parser.add_argument(flag, **options)
     parsed_args = parser.parse_args(cmd_args[1:])
     self.parsed_args = parsed_args
     for attr in ('dbpath', 'filterpath', 'filtername', 'filterstring',
                  'savepath', 'confpath'):
         setattr(self, attr, getattr(parsed_args, attr))
     self.conf = ConfigLoader(job_conf_path=self.confpath)
     self.module_name = parsed_args.module_name
     self.module_conf = (
         None if self.conf is None
         else self.conf.get_module_conf(self.module_name))
     self.report_types = parsed_args.reporttypes
     db_fname = os.path.basename(self.dbpath)
     # The last seven characters of the file name are cut off.
     self.output_basename = db_fname[:-7]
     self.output_dir = os.path.dirname(self.dbpath)
     self.status_fpath = os.path.join(
         self.output_dir, '{}.status.json'.format(self.output_basename))
Ejemplo n.º 7
0
 def parse_cmd_args(self, cmd_args):
     """Parse ``cmd_args[1:]`` and keep the options on ``self``.

     The parser accepts a positional ``dbpath`` plus the ``-f``, ``-F``,
     ``--filterstring``, ``-s`` and ``-c`` options.  A ``ConfigLoader`` for
     the parsed conf path is stored in ``self.conf``.

     Args:
         cmd_args: Argument list; the first item is skipped when parsing.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('dbpath', help='Path to aggregator output')
     parser.add_argument(
         '-f', dest='filterpath', default=None, help='Path to filter file')
     parser.add_argument(
         '-F', dest='filtername', default=None,
         help='Name of filter (stored in aggregator output)')
     parser.add_argument(
         '--filterstring', dest='filterstring', default=None,
         help='Filter in JSON')
     parser.add_argument(
         '-s', dest='savepath', default=None, help='Path to save file')
     parser.add_argument('-c', dest='confpath', help='path to a conf file')
     options = parser.parse_args(cmd_args[1:])
     # Mirror every parsed option under the same attribute name.
     for attr in ('dbpath', 'filterpath', 'filtername', 'filterstring',
                  'savepath', 'confpath'):
         setattr(self, attr, getattr(options, attr))
     self.conf = ConfigLoader(job_conf_path=self.confpath)
Ejemplo n.º 8
0
 def __init__(self, cmd_args, status_writer, live=False):
     """Set up the instance from a script-style argument list.

     Args:
         cmd_args: Argument list whose first item is the path of the
             module's main file; the whole list is forwarded to
             ``_parse_cmd_args``.
         status_writer: Stored as ``self.status_writer``.
         live: Stored as ``self.live``.
     """
     self.live = live
     self.t = time.time()
     self.status_writer = status_writer
     main_fpath = cmd_args[0]
     module_dir, main_basename = os.path.split(main_fpath)
     # Module name is the file name minus its final extension, if any.
     self.module_name = main_basename.rsplit('.', 1)[0]
     self.module_dir = module_dir
     self.mapper_dir = module_dir
     self.cmd_parser = self.cmd_args = None
     self.input_path = self.input_dir = None
     self.reader = None
     self.output_dir = self.output_base_fname = None
     self.crx_path = self.crg_path = self.crt_path = None
     self.crx_writer = self.crg_writer = self.crt_writer = None
     self.gene_sources = []
     self.gene_info = {}
     self._define_main_cmd_args()
     self._define_additional_cmd_args()
     self._parse_cmd_args(cmd_args)
     self._setup_logger()
     self.conf = ConfigLoader().get_module_conf(self.module_name)
     self.cravat_version = pkg_resources.get_distribution(
         'open-cravat').version
Ejemplo n.º 9
0
    def __init__(self, cmd_args, status_writer):
        """Initialise the annotator from a script-style argument list.

        Names and directories are derived from ``cmd_args[0]``, the
        arguments are parsed, the output directory is created, and the
        module configuration is loaded and verified.  Any exception is
        handed to ``self._log_exception``.

        Args:
            cmd_args: Argument list whose first item is the path of the
                annotator's main file.
            status_writer: Stored as ``self.status_writer``.
        """
        try:
            self.status_writer = status_writer
            self.logger = None
            # Database handles are reset up front so the attributes exist
            # even when a later setup step fails.
            self.dbconn = None
            self.cursor = None
            main_fpath = cmd_args[0]
            main_basename = os.path.basename(main_fpath)
            # Strip only the final extension from the file name.
            self.annotator_name = main_basename.rsplit('.', 1)[0]
            self.annotator_dir = os.path.dirname(main_fpath)
            self.data_dir = os.path.join(self.annotator_dir, 'data')

            # Command-line derived settings; parse_cmd_args presumably
            # fills these in.
            self.primary_input_path = None
            self.secondary_paths = None
            self.output_dir = None
            self.output_basename = None
            self.plain_output = None
            self.job_conf_path = None
            self.parse_cmd_args(cmd_args)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists test.
            os.makedirs(self.output_dir, exist_ok=True)

            self._setup_logger()
            config_loader = ConfigLoader(self.job_conf_path)
            self.conf = config_loader.get_module_conf(self.annotator_name)
            self._verify_conf()
            self._id_col_name = self.conf['output_columns'][0]['name']
            if 'logging_level' in self.conf:
                self.logger.setLevel(self.conf['logging_level'].upper())
            if 'title' in self.conf:
                self.annotator_display_name = self.conf['title']
            else:
                # No title configured: use the upper-cased directory name.
                self.annotator_display_name = os.path.basename(
                    self.annotator_dir).upper()
            if 'version' in self.conf:
                self.annotator_version = self.conf['version']
            else:
                self.annotator_version = ''
        except Exception as e:
            self._log_exception(e)
Ejemplo n.º 10
0
    def __init__(self, cmd_args):
        """Initialise the annotator from its command-line arguments.

        Names and directories are derived from the file that defines this
        class, the arguments are parsed, the output directory is created,
        and the module configuration is loaded and verified.  Any
        exception is handed to ``self._log_exception``.

        Args:
            cmd_args: Argument list forwarded to ``parse_cmd_args``.
        """
        try:
            self.logger = None
            # Database handles are reset up front so the attributes exist
            # even when a later setup step fails.
            self.dbconn = None
            self.cursor = None
            main_fpath = os.path.abspath(sys.modules[self.__module__].__file__)
            main_basename = os.path.basename(main_fpath)
            # Strip only the final extension from the file name.
            self.annotator_name = main_basename.rsplit(".", 1)[0]
            self.annotator_dir = os.path.dirname(main_fpath)
            self.data_dir = os.path.join(self.annotator_dir, "data")

            # Command-line derived settings; parse_cmd_args presumably
            # fills these in.
            self.primary_input_path = None
            self.secondary_paths = None
            self.output_dir = None
            self.output_basename = None
            self.plain_output = None
            self.job_conf_path = None
            self.parse_cmd_args(cmd_args)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists test.
            os.makedirs(self.output_dir, exist_ok=True)

            self._setup_logger()
            config_loader = ConfigLoader(self.job_conf_path)
            self.conf = config_loader.get_module_conf(self.annotator_name)
            self._verify_conf()
            self._id_col_name = self.conf["output_columns"][0]["name"]
            if "logging_level" in self.conf:
                self.logger.setLevel(self.conf["logging_level"].upper())
            if "title" in self.conf:
                self.annotator_display_name = self.conf["title"]
            else:
                # No title configured: use the upper-cased directory name.
                self.annotator_display_name = os.path.basename(
                    self.annotator_dir
                ).upper()
            # Always define the attribute; an absent version becomes "".
            if "version" in self.conf:
                self.annotator_version = self.conf["version"]
            else:
                self.annotator_version = ""
            self.logger.info("Initialized %s", self.annotator_name)
        except Exception as e:
            self._log_exception(e)
Ejemplo n.º 11
0
 def __init__(self, *inargs, **inkwargs):
     """Initialise the instance from positional/keyword command arguments.

     File, reader and writer attributes start as ``None``; the command
     parsers are defined and the arguments parsed.  ``status_writer`` and
     ``live`` are taken from the parsed arguments when present (defaulting
     to ``None`` and ``False``), and the module name and directories are
     derived from ``self.args.script_path``.
     """
     self.cmd_parser = None
     self.input_path = self.input_dir = None
     self.reader = None
     self.output_dir = self.output_base_fname = None
     self.crx_path = self.crg_path = self.crt_path = None
     self.crx_writer = self.crg_writer = self.crt_writer = None
     self._define_main_cmd_args()
     self._define_additional_cmd_args()
     self._parse_cmd_args(inargs, inkwargs)
     status_writer = getattr(self.args, "status_writer", None)
     self.live = getattr(self.args, "live", False)
     self.t = time.time()
     self.status_writer = status_writer
     module_dir, main_basename = os.path.split(self.args.script_path)
     # Module name is the script file name minus its final extension.
     self.module_name = main_basename.rsplit(".", 1)[0]
     self.module_dir = module_dir
     self.mapper_dir = module_dir
     self.gene_sources = []
     self.gene_info = {}
     self._setup_logger()
     self.conf = ConfigLoader().get_module_conf(self.module_name)
     self.cravat_version = pkg_resources.get_distribution("open-cravat").version
Ejemplo n.º 12
0
 def parse_cmd_args(self, parser, cmd_args):
     """Parse ``cmd_args`` with ``parser`` and copy the results onto ``self``.

     The parsed options are mirrored as attributes.  ``output_dir``
     defaults to the directory of ``dbpath``, a bare ``savepath`` file name
     is resolved against ``output_dir``, and the ``ConfigLoader``, module
     configuration and status file path are stored as well.

     Args:
         parser: Argument parser providing ``parse_args``.
         cmd_args: Raw arguments; passed through ``clean_args`` first.
     """
     args = parser.parse_args(clean_args(cmd_args))
     self.parsed_args = args
     self.dbpath = args.dbpath
     self.filterpath = args.filterpath
     self.filtername = args.filtername
     self.filterstring = args.filterstring
     self.confs = None
     if args.confs is not None:
         # Strip wrapping single quotes and turn inner single quotes into
         # double quotes so the text parses as JSON.
         self.confs = json.loads(args.confs.strip("'").replace("'", '"'))
         self.filter = (
             self.confs['filter'] if 'filter' in self.confs else None)
     self.output_dir = (
         args.output_dir
         if args.output_dir is not None
         else os.path.dirname(self.dbpath))
     self.savepath = args.savepath
     if self.savepath is not None and os.path.dirname(self.savepath) == '':
         # A bare file name is placed inside the output directory.
         self.savepath = os.path.join(self.output_dir, self.savepath)
     self.confpath = args.confpath
     self.conf = ConfigLoader(job_conf_path=self.confpath)
     self.module_name = args.module_name
     self.module_conf = (
         None if self.conf is None
         else self.conf.get_module_conf(self.module_name))
     if hasattr(args, 'reporttypes'):
         self.report_types = args.reporttypes
     # The last seven characters of the database file name are cut off.
     self.output_basename = os.path.basename(self.dbpath)[:-7]
     self.status_fpath = os.path.join(
         self.output_dir, '{}.status.json'.format(self.output_basename))
     self.nogenelevelonvariantlevel = args.nogenelevelonvariantlevel
     self.args = args
Ejemplo n.º 13
0
class Reporter(CravatReport):
    """Report that accumulates tables in ``self.data`` and returns them.

    Each level's rows are appended to a list stored under the level name;
    ``end`` adds summary entries and returns the whole dictionary.
    """

    def __init__(self, cmd_args, status_writer):
        """Flag the instance as log-free and status-free, then delegate."""
        # Set before delegating so the base initialiser sees both flags.
        self.no_log = True
        self.no_status_update = True
        super().__init__(cmd_args, status_writer)

    def setup(self):
        """Reset the collected data and pick up ``viewer_effective_digits``.

        The cravat configuration value is used when present; otherwise
        the value from ``constants`` applies.
        """
        self.data = {}
        self.keep_json_all_mapping = True
        self.conf = ConfigLoader()
        cravat_conf = self.conf.get_cravat_conf()
        digits_key = 'viewer_effective_digits'
        self.viewer_effective_digits = (
            cravat_conf[digits_key]
            if digits_key in cravat_conf
            else constants.viewer_effective_digits)

    def write_preface(self, level):
        """Start a fresh row list for ``level`` and make it current."""
        self.table = self.data[level] = []
        self.level = level

    def write_table_row(self, row):
        """Substitute values, round plain floats in place and store a copy."""
        row = self.substitute_val(self.level, row)
        for idx, cell in enumerate(row):
            # Exact type match on purpose: float subclasses are left alone.
            if type(cell) is float:
                row[idx] = round(cell, self.viewer_effective_digits)
        self.table.append(list(row))

    def end(self):
        """Attach row count, column info and warnings; return the data."""
        self.data['info'] = {'norows': len(self.data[self.level])}
        self.data['colinfo'] = self.colinfo
        self.data['warning_msgs'] = self.warning_msgs
        return self.data
Ejemplo n.º 14
0
class CravatReport:
    def __init__(self, cmd_args, status_writer):
        """Store the status writer, parse ``cmd_args`` and reset state.

        Args:
            cmd_args: Forwarded to ``parse_cmd_args``.
            status_writer: Stored as ``self.status_writer``.
        """
        self.status_writer = status_writer
        self.parse_cmd_args(cmd_args)
        self.cursor = self.cf = None
        self.filtertable = 'filter'
        # Bookkeeping containers all start empty.
        self.colinfo, self.colnos, self.ord_cols = {}, {}, {}
        self.var_added_cols, self.summarizing_modules = [], []
        self.columngroups, self.column_subs = {}, {}
        self._setup_logger()

    async def prep(self):
        """Await ``connect_db`` and then ``load_filter``, in that order."""
        await self.connect_db()
        await self.load_filter()

    def _setup_logger(self):
        if hasattr(self, 'no_log') and self.no_log:
            return
        try:
            self.logger = logging.getLogger('cravat.' + self.module_name)
        except Exception as e:
            self._log_exception(e)
        self.error_logger = logging.getLogger('error.' + self.module_name)
        self.unique_excs = []

    def _log_exception(self, e, halt=True):
        if halt:
            raise e
        else:
            if self.logger:
                self.logger.exception(e)

    async def getjson(self, level):
        ret = None
        if await self.table_exists(level) == False:
            return ret
        for row in await self.cf.getiterator(level):
            row = self.substitute_val(level, row)
            return json.dumps(row)

    def substitute_val(self, level, row):
        if level in self.column_subs:
            column_sub_level = self.column_subs[level]
            for i in self.column_subs[level]:
                column_sub_i = column_sub_level[i]
                value = row[i]
                if value is not None:
                    if value in column_sub_i:
                        row[i] = column_sub_i[value]
        return row

    async def run_level(self, level):
        if await self.table_exists(level):
            if level == 'variant':
                await self.cf.make_filtered_uid_table()
            elif level == 'gene':
                await self.cf.make_filtered_hugo_table()
                gene_summary_datas = {}
                for mi, o, cols in self.summarizing_modules:
                    gene_summary_data = await o.get_gene_summary_data(self.cf)
                    gene_summary_datas[mi.name] = [gene_summary_data, cols]
            self.write_preface(level)
            self.write_header(level)
            if level == 'variant':
                hugo_present = 'base__hugo' in self.colnos['variant']
            for row in await self.cf.get_filtered_iterator(level):
                row = list(row)
                if level == 'variant':
                    if hugo_present:
                        hugo = row[self.colnos['variant']['base__hugo']]
                        generow = await self.cf.get_gene_row(hugo)
                        for colname in self.var_added_cols:
                            if generow == None:
                                colval = None
                            else:
                                colval = generow[self.colnos['gene'][colname]]
                            row.append(colval)
                elif level == 'gene':
                    hugo = row[0]
                    for mi, _, _ in self.summarizing_modules:
                        module_name = mi.name
                        [gene_summary_data,
                         cols] = gene_summary_datas[module_name]
                        if hugo in gene_summary_data:
                            row.extend([
                                gene_summary_data[hugo][col['name']]
                                for col in cols
                            ])
                        else:
                            row.extend([None for v in cols])
                row = self.substitute_val(level, row)
                if hasattr(self, 'keep_json_all_mapping'
                           ) == False and level == 'variant':
                    colno = self.colnos['variant']['base__all_mappings']
                    all_map = json.loads(row[colno])
                    newvals = []
                    for hugo in all_map:
                        for maprow in all_map[hugo]:
                            [protid, protchange, so, transcript,
                             rnachange] = maprow
                            if protid == None:
                                protid = '(na)'
                            if protchange == None:
                                protchange = '(na)'
                            if rnachange == None:
                                rnachange = '(na)'
                            newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange
                            newvals.append(newval)
                    newvals.sort()
                    newcell = '; '.join(newvals)
                    row[colno] = newcell
                newrow = []
                for colname in self.ord_cols[level]:
                    colno = self.colnos[level][colname]
                    value = row[colno]
                    newrow.append(value)
                self.write_table_row(newrow)

    async def run(self, tab='all'):
        start_time = time.time()
        if not (hasattr(self, 'no_log') and self.no_log):
            self.logger.info('started: %s' %
                             time.asctime(time.localtime(start_time)))
        if self.module_conf is not None:
            self.status_writer.queue_status_update(
                'status', 'Started {} ({})'.format(self.module_conf['title'],
                                                   self.module_name))
        self.setup()
        if tab == 'all':
            for level in await self.cf.get_result_levels():
                if await self.table_exists(level):
                    await self.make_col_info(level)
            for level in await self.cf.get_result_levels():
                if await self.table_exists(level):
                    await self.run_level(level)
        else:
            if tab in ['variant', 'gene']:
                for level in ['variant', 'gene']:
                    if await self.table_exists(level):
                        await self.make_col_info(level)
            else:
                await self.make_col_info(tab)
            await self.run_level(tab)
        if self.module_conf is not None:
            self.status_writer.queue_status_update(
                'status', 'Finished {} ({})'.format(self.module_conf['title'],
                                                    self.module_name))
        end_time = time.time()
        if not (hasattr(self, 'no_log') and self.no_log):
            self.logger.info('finished: {0}'.format(
                time.asctime(time.localtime(end_time))))
            run_time = end_time - start_time
            self.logger.info('runtime: {0:0.3f}'.format(run_time))
        ret = self.end()
        return ret

    async def get_variant_colinfo(self):
        self.setup()
        level = 'variant'
        if await self.table_exists(level):
            await self.make_col_info(level)
        level = 'gene'
        if await self.table_exists(level):
            await self.make_col_info(level)
        return self.colinfo

    def setup(self):
        pass

    def end(self):
        pass

    def write_preface(self, level):
        pass

    def write_header(self, level):
        pass

    def write_table_row(self, row):
        pass

    async def make_col_info(self, level):
        cravat_conf = self.conf.get_cravat_conf()
        if 'report_module_order' in cravat_conf:
            priority_colgroups = cravat_conf['report_module_order']
        else:
            priority_colgroups = ['base', 'hg19', 'hg18', 'tagsampler']
        # ordered column groups
        self.columngroups[level] = []
        sql = 'select name, displayname from ' + level + '_annotator'
        await self.cursor.execute(sql)
        rows = await self.cursor.fetchall()
        for priority_colgroup in priority_colgroups:
            for row in rows:
                colgroup = row[0]
                if colgroup == priority_colgroup:
                    (name, displayname) = row
                    self.columngroups[level].append({
                        'name': name,
                        'displayname': displayname,
                        'count': 0
                    })
        for row in rows:
            colgroup = row[0]
            if colgroup in priority_colgroups:
                pass
            else:
                (name, displayname) = row
                self.columngroups[level].append({
                    'name': name,
                    'displayname': displayname,
                    'count': 0
                })
        # ordered column names
        sql = 'select * from ' + level + '_header'
        await self.cursor.execute(sql)
        columns = []
        unordered_rows = await self.cursor.fetchall()
        rows = []
        self.ord_cols[level] = []
        for group in priority_colgroups:
            for row in unordered_rows:
                [col_group, col_name] = row[0].split('__')
                if col_group == group:
                    rows.append(row)
                    self.ord_cols[level].append(row[0])
        for row in unordered_rows:
            [col_group, col_name] = row[0].split('__')
            if col_group not in priority_colgroups:
                rows.append(row)
                self.ord_cols[level].append(row[0])
        # unordered column numbers
        self.colnos[level] = {}
        colcount = 0
        for row in unordered_rows:
            self.colnos[level][row[0]] = colcount
            colcount += 1
        # ordered column details
        for row in rows:
            (colname, coltitle, col_type) = row[:3]
            col_cats = json.loads(row[3]) if len(row) > 3 and row[3] else []
            col_width = row[4] if len(row) > 4 else None
            col_desc = row[5] if len(row) > 5 else None
            col_hidden = bool(row[6]) if len(row) > 6 else False
            col_ctg = row[7] if len(row) > 7 else None
            if col_ctg in ['single', 'multi'] and len(col_cats) == 0:
                sql = 'select distinct {} from {}'.format(colname, level)
                await self.cursor.execute(sql)
                rs = await self.cursor.fetchall()
                for r in rs:
                    col_cats.append(r[0])
            col_filterable = bool(row[8]) if len(row) > 8 else True
            link_format = row[9] if len(row) > 9 else None
            column = {
                'col_name': colname,
                'col_title': coltitle,
                'col_type': col_type,
                'col_cats': col_cats,
                'col_width': col_width,
                'col_desc': col_desc,
                'col_hidden': col_hidden,
                'col_ctg': col_ctg,
                'col_filterable': col_filterable,
                'link_format': link_format,
            }
            columns.append(column)
            groupname = colname.split('__')[0]
            for columngroup in self.columngroups[level]:
                if columngroup['name'] == groupname:
                    columngroup['count'] += 1
        if level == 'variant' and await self.table_exists('gene'):
            modules_to_add = []
            q = 'select name from gene_annotator'
            await self.cursor.execute(q)
            gene_annotators = [v[0] for v in await self.cursor.fetchall()]
            k = 'add_gene_module_to_variant'
            if self.conf.has_key(k):
                modules_to_add = self.conf.get_val(k)
            for module in gene_annotators:
                module_info = au.get_local_module_info(module)
                if module_info == None:
                    continue
                module_conf = module_info.conf
                if 'add_to_variant_level' in module_conf:
                    if module_conf['add_to_variant_level'] == True:
                        modules_to_add.append(module)
            for module in modules_to_add:
                if not module in gene_annotators:
                    continue
                mi = au.get_local_module_info(module)
                cols = mi.conf['output_columns']
                self.columngroups[level].append({
                    'name': mi.name,
                    'displayname': mi.title,
                    'count': len(cols)
                })
                for col in cols:
                    colname = mi.name + '__' + col['name']
                    self.colnos[level][colname] = colcount
                    self.ord_cols[level].append(colname)
                    colcount += 1
                    col_type = col['type']
                    col_cats = col.get('categories', [])
                    col_width = col.get('width')
                    col_desc = col.get('desc')
                    col_hidden = col.get('hidden', False)
                    col_ctg = col.get('category', None)
                    if col_ctg in ['category', 'multicategory'
                                   ] and len(col_cats) == 0:
                        sql = 'select distinct {} from {}'.format(
                            colname, level)
                        await self.cursor.execute(sql)
                        rs = await self.cursor.fetchall()
                        for r in rs:
                            col_cats.append(r[0])
                    col_filterable = col.get('filterable', True)
                    col_link_format = col.get('link_format')
                    column = {
                        'col_name': colname,
                        'col_title': col['title'],
                        'col_type': col_type,
                        'col_cats': col_cats,
                        'col_width': col_width,
                        'col_desc': col_desc,
                        'col_hidden': col_hidden,
                        'col_ctg': col_ctg,
                        'col_filterable': col_filterable,
                        'col_link_format': col_link_format,
                    }
                    columns.append(column)
                    self.var_added_cols.append(colname)
        # Gene level summary columns
        if level == 'gene':
            q = 'select name from variant_annotator'
            await self.cursor.execute(q)
            done_var_annotators = [v[0] for v in await self.cursor.fetchall()]
            self.summarizing_modules = []
            local_modules = au.get_local_module_infos_of_type('annotator')
            for module_name in local_modules:
                mi = local_modules[module_name]
                conf = mi.conf
                if 'can_summarize_by_gene' in conf and module_name in done_var_annotators:
                    sys.path = sys.path + [os.path.dirname(mi.script_path)]
                    annot_cls = util.load_class('CravatAnnotator',
                                                mi.script_path)
                    annot = annot_cls([mi.script_path, '__dummy__'], {})
                    cols = conf['gene_summary_output_columns']
                    for col in cols:
                        col['name'] = col['name']
                    columngroup = {}
                    columngroup['name'] = conf['name']
                    columngroup['displayname'] = conf['title']
                    columngroup['count'] = len(cols)
                    self.columngroups[level].append(columngroup)
                    for col in cols:
                        col_type = col['type']
                        col_cats = col.get('categories', [])
                        col_ctg = col.get('category', None)
                        if col_type in ['category', 'multicategory'
                                        ] and len(col_cats) == 0:
                            sql = 'select distinct {} from {}'.format(
                                colname, level)
                            await self.cursor.execute(sql)
                            rs = await self.cursor.fetchall()
                            for r in rs:
                                col_cats.append(r[0])
                        col_filterable = col.get('filterable', True)
                        col_link_format = col.get('link_format')
                        column = {
                            'col_name': conf['name'] + '__' + col['name'],
                            'col_title': col['title'],
                            'col_type': col_type,
                            'col_cats': col_cats,
                            'col_width': col.get('width'),
                            'col_desc': col.get('desc'),
                            'col_hidden': col.get('hidden', False),
                            'col_ctg': col_ctg,
                            'col_filterable': col_filterable,
                            'col_link_format': col_link_format,
                        }
                        columns.append(column)
                    self.summarizing_modules.append([mi, annot, cols])
                    for col in cols:
                        fullname = module_name + '__' + col['name']
                        self.ord_cols[level].append(fullname)
                        self.colnos[level][fullname] = len(self.colnos[level])
        colno = 0
        for colgroup in self.columngroups[level]:
            colno += colgroup['count']
            colgroup['lastcol'] = colno
        self.colinfo[level] = {
            'colgroups': self.columngroups[level],
            'columns': columns
        }
        # report substitution
        if level in ['variant', 'gene']:
            reportsubtable = level + '_reportsub'
            if await self.table_exists(reportsubtable):
                q = 'select * from {}'.format(reportsubtable)
                await self.cursor.execute(q)
                rs = await self.cursor.fetchall()
                self.report_substitution = {}
                for r in rs:
                    module = r[0]
                    sub = json.loads(r[1])
                    self.report_substitution[module] = sub
                self.column_subs[level] = {}
                columns = self.colinfo[level]['columns']
                for i in range(len(columns)):
                    column = columns[i]
                    [module, col] = column['col_name'].split('__')
                    if module in self.report_substitution:
                        sub = self.report_substitution[module]
                        if col in sub:
                            self.column_subs[level][i] = sub[col]
                            self.colinfo[level]['columns'][i][
                                'reportsub'] = sub[col]

    def parse_cmd_args(self, cmd_args):
        """Parse the reporter command line and copy the options onto ``self``.

        ``cmd_args`` is an argv-style sequence: its first element is skipped
        and the remainder is parsed with ``argparse``.  Besides the raw
        options, the job configuration is loaded from ``-c`` and the output
        directory, output base name and status-file path are derived from
        ``dbpath``.
        """
        cli = argparse.ArgumentParser()
        cli.add_argument('dbpath', help='Path to aggregator output')
        # (flags, keyword arguments) for every optional argument, in the
        # order they are registered with the parser.
        optional_specs = (
            (('-f',), dict(dest='filterpath',
                           default=None,
                           help='Path to filter file')),
            (('-F',), dict(dest='filtername',
                           default=None,
                           help='Name of filter (stored in aggregator output)')),
            (('--filterstring',), dict(dest='filterstring',
                                       default=None,
                                       help='Filter in JSON')),
            (('-s',), dict(dest='savepath',
                           default=None,
                           help='Path to save file')),
            (('-c',), dict(dest='confpath', help='path to a conf file')),
            (('-t',), dict(dest='reporttypes',
                           nargs='+',
                           default=None,
                           help='report types')),
            (('--module-name',), dict(dest='module_name',
                                      default=None,
                                      help='report module name')),
        )
        for flags, settings in optional_specs:
            cli.add_argument(*flags, **settings)

        opts = cli.parse_args(cmd_args[1:])
        self.parsed_args = opts
        for attr in ('dbpath', 'filterpath', 'filtername', 'filterstring',
                     'savepath', 'confpath'):
            setattr(self, attr, getattr(opts, attr))
        self.conf = ConfigLoader(job_conf_path=self.confpath)
        self.module_name = opts.module_name
        self.module_conf = (self.conf.get_module_conf(self.module_name)
                            if self.conf is not None else None)
        self.report_types = opts.reporttypes

        db_dir, db_file = os.path.split(self.dbpath)
        # Drops the last seven characters of the file name -- presumably a
        # ".sqlite" suffix; any other extension is truncated blindly.
        self.output_basename = db_file[:-7]
        self.output_dir = db_dir
        self.status_fpath = os.path.join(
            self.output_dir, '{}.status.json'.format(self.output_basename))

    async def connect_db(self, dbpath=None):
        """Open the result database and create a cursor on it.

        Args:
            dbpath: Optional path that replaces ``self.dbpath`` before
                connecting.

        Raises:
            SystemExit: with status 1 when no path is configured or the path
                does not exist (a message is written to stderr first).

        On success ``self.conn`` and ``self.cursor`` are set.
        """
        if dbpath is not None:
            self.dbpath = dbpath
        if self.dbpath is None:
            sys.stderr.write('Provide a path to aggregator output')
            # sys.exit instead of the interactive-only ``exit`` helper, and a
            # non-zero status so callers can tell the run failed.
            sys.exit(1)
        if not os.path.exists(self.dbpath):
            sys.stderr.write(self.dbpath + ' does not exist.')
            sys.exit(1)
        self.conn = await aiosqlite3.connect(self.dbpath)
        self.cursor = await self.conn.cursor()

    async def load_filter(self):
        """Create the ``CravatFilter`` for ``self.dbpath`` and load its filter.

        The new filter object is stored on ``self.cf``; the filter itself is
        loaded from whichever of ``filterpath`` / ``filtername`` /
        ``filterstring`` was supplied.
        """
        cravat_filter = await CravatFilter.create(dbpath=self.dbpath)
        self.cf = cravat_filter
        filter_sources = {
            'filterpath': self.filterpath,
            'filtername': self.filtername,
            'filterstring': self.filterstring,
        }
        await cravat_filter.loadfilter(**filter_sources)

    async def table_exists(self, tablename):
        """Return True when a table named *tablename* exists in the database.

        Args:
            tablename: Table name to look up in ``sqlite_master``.

        Returns:
            bool: True if a matching table row was found, False otherwise.

        The name is passed as a bound parameter rather than spliced into the
        SQL text, so names containing quotes are matched literally and cannot
        alter the query.
        """
        sql = ("select name from sqlite_master "
               "where type='table' and name=?")
        await self.cursor.execute(sql, (tablename,))
        row = await self.cursor.fetchone()
        return row is not None
Ejemplo n.º 15
0
 def parse_cmd_args(self, inargs, inkwargs):
     """Resolve reporter options from CLI/keyword arguments onto ``self``.

     Args:
         inargs: Positional arguments forwarded to ``cravat.util.get_args``.
         inkwargs: Keyword arguments forwarded to ``cravat.util.get_args``.

     Sets the database/filter/output attributes, merges module options
     from the job configuration, ``conf`` and ``confs`` into ``self.confs``,
     chooses the filter, and, when no input files were given, reads the
     original input paths back from the result database's ``info`` table.
     """
     parsed_args = cravat.util.get_args(parser, inargs, inkwargs)
     self.parsed_args = parsed_args
     if parsed_args.md is not None:
         constants.custom_modules_dir = parsed_args.md
     self.dbpath = parsed_args.dbpath
     self.filterpath = parsed_args.filterpath
     self.filtername = parsed_args.filtername
     self.filterstring = parsed_args.filterstring
     self.filtersql = parsed_args.filtersql
     self.filter = parsed_args.filter
     self.confs = {}
     if parsed_args.output_dir is not None:
         self.output_dir = parsed_args.output_dir
     else:
         self.output_dir = os.path.dirname(self.dbpath)
     self.savepath = parsed_args.savepath
     # A bare file name for the save path is placed in the output directory.
     if self.savepath is not None and os.path.dirname(self.savepath) == "":
         self.savepath = os.path.join(self.output_dir, self.savepath)
     self.confpath = parsed_args.confpath
     self.conf = ConfigLoader(job_conf_path=self.confpath)
     self.module_name = parsed_args.module_name
     if self.module_name in self.conf._all:
         self.confs.update(self.conf._all[self.module_name])
     # self.conf was just constructed and dereferenced above, so it cannot
     # be None here; no fallback branch is needed.
     self.module_conf = self.conf.get_module_conf(self.module_name)
     if hasattr(parsed_args, "reporttypes"):
         self.report_types = parsed_args.reporttypes
     if hasattr(parsed_args, "conf") and parsed_args.conf is not None:
         self.confs.update(parsed_args.conf)
     if parsed_args.confs is not None:
         # The option arrives as a quoted, single-quote-style JSON string.
         confs = parsed_args.confs.lstrip("'").rstrip("'").replace("'", '"')
         self.confs.update(json.loads(confs))
     # Chooses filter.
     if self.filter is None:
         if "filter" in self.confs:
             self.filter = self.confs["filter"]
         local = au.mic.get_local()
         # Fall back to the package's own filter only when nothing else
         # selected one.
         if (self.filter is None and self.filterpath is None
                 and self.filtername is None and self.filterstring is None
                 and parsed_args.package is not None
                 and parsed_args.package in local
                 and "filter" in local[parsed_args.package].conf):
             self.filter = local[parsed_args.package].conf["filter"]
     self.output_basename = os.path.basename(self.dbpath)[:-7]
     status_fname = "{}.status.json".format(self.output_basename)
     self.status_fpath = os.path.join(self.output_dir, status_fname)
     self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
     if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
         q = 'select colval from info where colkey="_input_paths"'
         db = sqlite3.connect(parsed_args.dbpath)
         # try/finally so the cursor and connection are released even when
         # the query or the JSON decoding below raises.
         try:
             c = db.cursor()
             try:
                 c.execute(q)
                 r = c.fetchone()
                 if r is not None:
                     parsed_args.inputfiles = []
                     s = r[0]
                     if " " in s:
                         s = s.replace("'", '"')
                     s = s.replace("\\", "\\\\\\\\")
                     s = json.loads(s)
                     for k in s:
                         input_path = s[k]
                         parsed_args.inputfiles.append(input_path)
             finally:
                 c.close()
         finally:
             db.close()
     if hasattr(parsed_args, "status_writer"):
         self.status_writer = parsed_args.status_writer
     else:
         self.status_writer = None
     self.concise_report = parsed_args.concise_report
     self.extract_columns_multilevel = self.get_standardized_module_option(
         self.confs.get("extract-columns", {}))
     self.args = parsed_args
Ejemplo n.º 16
0
class CravatReport:
    def __init__(self, *inargs, **inkwargs):
        """Initialise reporter state, parse the arguments and set up logging.

        Args:
            *inargs: Positional arguments forwarded to ``parse_cmd_args``.
            **inkwargs: Keyword arguments forwarded to ``parse_cmd_args``.

        After parsing, the ``-t`` action is removed from the shared
        module-level parser's "optional arguments" group.
        """
        self.cf = None
        self.filtertable = "filter"
        self.colinfo = {}
        self.colnos = {}
        self.newcolnos = {}
        self.var_added_cols = []
        self.summarizing_modules = []
        self.columngroups = {}
        self.column_subs = {}
        self.column_sub_allow_partial_match = {}
        self.colname_conversion = {}
        self.warning_msgs = []
        self.colnames_to_display = {}
        self.colnos_to_display = {}
        self.display_select_columns = {}
        self.extracted_cols = {}
        self.conn = None
        self.levels_to_write = None
        self.parse_cmd_args(inargs, inkwargs)
        # NOTE(review): argparse titles this group "options" from Python
        # 3.10 on, so this match may never succeed on newer interpreters --
        # confirm whether the removal is still wanted there.
        for ag in parser._action_groups:
            if ag.title == "optional arguments":
                # Iterate over a snapshot: removing from the list being
                # iterated would skip the element after each removal.
                for a in list(ag._actions):
                    if "-t" in a.option_strings:
                        ag._actions.remove(a)
        self._setup_logger()

    def parse_cmd_args(self, inargs, inkwargs):
        """Resolve reporter options from CLI/keyword arguments onto ``self``.

        Args:
            inargs: Positional arguments forwarded to
                ``cravat.util.get_args``.
            inkwargs: Keyword arguments forwarded to
                ``cravat.util.get_args``.

        Sets the database/filter/output attributes, merges module options
        from the job configuration, ``conf`` and ``confs`` into
        ``self.confs``, chooses the filter, and, when no input files were
        given, reads the original input paths back from the result
        database's ``info`` table.
        """
        parsed_args = cravat.util.get_args(parser, inargs, inkwargs)
        self.parsed_args = parsed_args
        if parsed_args.md is not None:
            constants.custom_modules_dir = parsed_args.md
        self.dbpath = parsed_args.dbpath
        self.filterpath = parsed_args.filterpath
        self.filtername = parsed_args.filtername
        self.filterstring = parsed_args.filterstring
        self.filtersql = parsed_args.filtersql
        self.filter = parsed_args.filter
        self.confs = {}
        if parsed_args.output_dir is not None:
            self.output_dir = parsed_args.output_dir
        else:
            self.output_dir = os.path.dirname(self.dbpath)
        self.savepath = parsed_args.savepath
        # A bare file name for the save path is placed in the output
        # directory.
        if self.savepath is not None and os.path.dirname(self.savepath) == "":
            self.savepath = os.path.join(self.output_dir, self.savepath)
        self.confpath = parsed_args.confpath
        self.conf = ConfigLoader(job_conf_path=self.confpath)
        self.module_name = parsed_args.module_name
        if self.module_name in self.conf._all:
            self.confs.update(self.conf._all[self.module_name])
        # self.conf was just constructed and dereferenced above, so it
        # cannot be None here; no fallback branch is needed.
        self.module_conf = self.conf.get_module_conf(self.module_name)
        if hasattr(parsed_args, "reporttypes"):
            self.report_types = parsed_args.reporttypes
        if hasattr(parsed_args, "conf") and parsed_args.conf is not None:
            self.confs.update(parsed_args.conf)
        if parsed_args.confs is not None:
            # The option arrives as a quoted, single-quote-style JSON string.
            confs = parsed_args.confs.lstrip("'").rstrip("'").replace("'", '"')
            self.confs.update(json.loads(confs))
        # Chooses filter.
        if self.filter is None:
            if "filter" in self.confs:
                self.filter = self.confs["filter"]
            local = au.mic.get_local()
            # Fall back to the package's own filter only when nothing else
            # selected one.
            if (self.filter is None and self.filterpath is None
                    and self.filtername is None and self.filterstring is None
                    and parsed_args.package is not None
                    and parsed_args.package in local
                    and "filter" in local[parsed_args.package].conf):
                self.filter = local[parsed_args.package].conf["filter"]
        self.output_basename = os.path.basename(self.dbpath)[:-7]
        status_fname = "{}.status.json".format(self.output_basename)
        self.status_fpath = os.path.join(self.output_dir, status_fname)
        self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
        if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
            q = 'select colval from info where colkey="_input_paths"'
            db = sqlite3.connect(parsed_args.dbpath)
            # try/finally so the cursor and connection are released even
            # when the query or the JSON decoding below raises.
            try:
                c = db.cursor()
                try:
                    c.execute(q)
                    r = c.fetchone()
                    if r is not None:
                        parsed_args.inputfiles = []
                        s = r[0]
                        if " " in s:
                            s = s.replace("'", '"')
                        s = s.replace("\\", "\\\\\\\\")
                        s = json.loads(s)
                        for k in s:
                            input_path = s[k]
                            parsed_args.inputfiles.append(input_path)
                finally:
                    c.close()
            finally:
                db.close()
        if hasattr(parsed_args, "status_writer"):
            self.status_writer = parsed_args.status_writer
        else:
            self.status_writer = None
        self.concise_report = parsed_args.concise_report
        self.extract_columns_multilevel = self.get_standardized_module_option(
            self.confs.get("extract-columns", {}))
        self.args = parsed_args

    def should_write_level(self, level):
        """Tell whether *level* should be written.

        Every level is written when ``levels_to_write`` is None; otherwise
        only the levels it contains.
        """
        allowed = self.levels_to_write
        return allowed is None or level in allowed

    async def prep(self):
        """Connect to the database and load the filter.

        Raises:
            Exception: whatever ``connect_db`` or ``load_filter`` raised,
                re-raised after clean-up with ``handled = True`` set on it.

        On failure the filter's database is closed (if a filter exists) and
        the error is reported: with a printed traceback unless the exception
        carries ``notraceback == True``, in which case it is only logged.
        """
        try:
            await self.connect_db()
            await self.load_filter()
        except Exception as e:
            # ``cf`` starts out as None, so test the value rather than the
            # attribute's existence; otherwise a failure before the filter
            # is created would raise AttributeError and hide ``e``.
            cf = getattr(self, "cf", None)
            if cf is not None:
                await cf.close_db()
            # The logger may be missing when logging was disabled.
            logger = getattr(self, "logger", None)
            if getattr(e, "notraceback", False) != True:
                import traceback
                traceback.print_exc()
                if logger is not None:
                    logger.error(e)
            elif logger is not None:
                write_log_msg(logger, e)
            e.handled = True
            raise

    def _setup_logger(self):
        if hasattr(self, "no_log") and self.no_log:
            return
        try:
            self.logger = logging.getLogger("cravat." + self.module_name)
        except Exception as e:
            self._log_exception(e)
        self.error_logger = logging.getLogger("error." + self.module_name)
        self.unique_excs = []

    async def get_db_conn(self):
        """Return the shared database connection, opening it on first use.

        Returns None when no ``dbpath`` is set.
        """
        db_path = self.dbpath
        if db_path is not None:
            if self.conn is None:
                self.conn = await aiosqlite.connect(db_path)
            return self.conn
        return None

    async def exec_db(self, func, *args, **kwargs):
        """Run ``func`` with a fresh cursor on the shared connection.

        ``func`` is awaited with the given arguments plus ``conn`` and
        ``cursor`` keyword arguments.  The cursor is closed whether ``func``
        returns or raises; the result (or the exception) is passed through.
        """
        connection = await self.get_db_conn()
        db_cursor = await connection.cursor()
        try:
            result = await func(*args, conn=connection, cursor=db_cursor,
                                **kwargs)
        finally:
            await db_cursor.close()
        return result

    def _log_exception(self, e, halt=True):
        if halt:
            raise e
        else:
            if self.logger:
                self.logger.exception(e)

    async def getjson(self, level):
        """Return the first row of *level* as a JSON string.

        Returns None when the level's table does not exist or when the
        filter iterator yields no rows.
        """
        table_found = await self.exec_db(self.table_exists, level)
        if table_found == False:
            return None
        level_rows = await self.cf.exec_db(self.cf.getiterator, level)
        # NOTE(review): returns on the first row, so later rows are never
        # serialised -- confirm this is intended.
        for record in level_rows:
            return json.dumps(self.substitute_val(level, record))

    def substitute_val(self, level, row):
        """Apply the report substitutions registered for *level* to *row*.

        Each substitution object supplies ``index``, ``module``, ``col`` and
        a ``subs`` mapping.  Empty or None cells are left alone.  Two base
        columns get special handling: ``all_mappings`` (variant level) has
        the comma-separated third field of every mapping entry substituted,
        and ``all_so`` (gene level) has the three-character prefix of every
        comma-separated item substituted.  Any other cell is replaced whole
        when ``subs`` has an entry for it.  ``row`` is modified in place and
        returned.
        """
        for rule in self.column_subs.get(level, []):
            cell = row[rule.index]
            if cell is None or cell == "":
                continue
            if (level == "variant" and rule.module == "base"
                    and rule.col == "all_mappings"):
                mappings = json.loads(row[rule.index])
                for gene in mappings:
                    for hit in mappings[gene]:
                        hit[2] = ",".join(
                            [rule.subs.get(so, so) for so in hit[2].split(",")])
                cell = json.dumps(mappings)
            elif (level == "gene" and rule.module == "base"
                  and rule.col == "all_so"):
                pieces = []
                for so_count in cell.split(","):
                    code, tail = so_count[:3], so_count[3:]
                    pieces.append(rule.subs.get(code, code) + tail)
                cell = ",".join(pieces)
            else:
                cell = rule.subs.get(cell, cell)
            row[rule.index] = cell
        return row

    def process_datarow(self, args):
        """Turn one raw data row into a report row.

        ``args`` is an indexable of four items: the raw data row, a flag
        telling whether some columns must be dropped, the level name, and a
        mapping of module name to ``[gene_summary_data, cols]``.

        Returns None when the data row is None; otherwise a new list with
        the values ordered as ``self.colinfo[level]["columns"]`` and report
        substitutions applied.

        NOTE(review): ``num_total_cols``, ``colnos_to_skip`` and
        ``hugo_present`` are read here but never assigned in this method
        (``run_level`` defines locals with the same names).  Unless they
        exist as module globals, the branches using them raise NameError --
        confirm before calling this method.
        """
        datarow = args[0]
        should_skip_some_cols = args[1]
        level = args[2]
        gene_summary_datas = args[3]
        if datarow is None:
            return None
        datarow = list(datarow)
        if should_skip_some_cols:
            # Relies on num_total_cols / colnos_to_skip from outside this
            # method (see the note in the docstring).
            datarow = [
                datarow[colno] for colno in range(num_total_cols)
                if colno not in colnos_to_skip
            ]
        if level == "variant":
            # adds gene level data to variant level.
            if self.nogenelevelonvariantlevel == False and hugo_present:
                hugo = datarow[self.colnos["variant"]["base__hugo"]]
                # NOTE(review): asyncio.ensure_future documents ``loop`` as
                # keyword-only, and result() is read without awaiting the
                # future -- confirm this path can actually run.
                loop = asyncio.get_event_loop()
                future = asyncio.ensure_future(self.cf.get_gene_row(hugo),
                                               loop)
                generow = future.result()
                if generow is None:
                    # No gene row: pad with one None per added column.
                    datarow.extend(
                        [None for i in range(len(self.var_added_cols))])
                else:
                    datarow.extend([
                        generow[self.colnos["gene"][colname]]
                        for colname in self.var_added_cols
                    ])
        elif level == "gene":
            # adds summary data to gene level.
            hugo = datarow[0]
            for mi, _, _ in self.summarizing_modules:
                module_name = mi.name
                [gene_summary_data, cols] = gene_summary_datas[module_name]
                # Summary values are used only when the gene has an entry
                # with exactly one value per summary column; otherwise the
                # row is padded with None.
                if (hugo in gene_summary_data
                        and gene_summary_data[hugo] is not None
                        and len(gene_summary_data[hugo]) == len(cols)):
                    datarow.extend(
                        [gene_summary_data[hugo][col["name"]] for col in cols])
                else:
                    datarow.extend([None for v in cols])
        # re-orders data row.
        new_datarow = []
        colnos = self.colnos[level]
        for colname in [
                col["col_name"] for col in self.colinfo[level]["columns"]
        ]:
            if colname in self.colname_conversion[level]:
                # Converted column: look up its position under the mapped
                # name; a column absent from the data is logged and left
                # out of the new row.
                newcolname = self.colname_conversion[level][colname]
                if newcolname in colnos:
                    colno = colnos[newcolname]
                else:
                    self.logger.info(
                        "column name does not exist in data: {}".format(
                            colname))
                    continue
            else:
                colno = colnos[colname]
            value = datarow[colno]
            new_datarow.append(value)
        # does report substitution.
        new_datarow = self.substitute_val(level, new_datarow)
        # Unless the instance defines ``keep_json_all_mapping``, the
        # all_mappings JSON is flattened into a sorted, "; "-joined list of
        # "transcript:hugo:protid:so:protchange:rnachange" strings.
        if hasattr(self,
                   "keep_json_all_mapping") == False and level == "variant":
            # NOTE(review): this index comes from self.colnos (raw row
            # order) but is applied to the re-ordered row; run_level uses
            # self.newcolnos for the same step -- confirm which is right.
            colno = self.colnos["variant"]["base__all_mappings"]
            all_map = json.loads(new_datarow[colno])
            newvals = []
            for hugo in all_map:
                for maprow in all_map[hugo]:
                    [protid, protchange, so, transcript, rnachange] = maprow
                    # Missing fields are rendered as "(na)".
                    if protid == None:
                        protid = "(na)"
                    if protchange == None:
                        protchange = "(na)"
                    if rnachange == None:
                        rnachange = "(na)"
                    newval = (transcript + ":" + hugo + ":" + protid + ":" +
                              so + ":" + protchange + ":" + rnachange)
                    newvals.append(newval)
            newvals.sort()
            newcell = "; ".join(newvals)
            new_datarow[colno] = newcell
        return new_datarow

    def get_extracted_header_columns(self, level):
        """Return the column definitions of *level* selected for display.

        A column is kept when its ``col_name`` appears in
        ``self.colnames_to_display[level]``; the original order is kept.
        """
        return [
            column for column in self.colinfo[level]["columns"]
            if column["col_name"] in self.colnames_to_display[level]
        ]

    async def run_level(self, level):
        ret = await self.exec_db(self.table_exists, level)
        if ret == False:
            return
        if self.should_write_level(level) == False:
            return
        gene_summary_datas = {}
        if level == "variant":
            await self.cf.exec_db(self.cf.make_filtered_uid_table)
        elif level == "gene":
            await self.cf.exec_db(self.cf.make_filtered_hugo_table)
            for mi, o, cols in self.summarizing_modules:
                if hasattr(o, "build_gene_collection"):
                    msg = "Obsolete module [{}] for gene level summarization. Update the module to get correct gene level summarization.".format(
                        mi.name)
                    self.warning_msgs.append(msg)
                    if self.args.silent == False:
                        print("===Warning: {}".format(msg))
                    gene_summary_data = {}
                else:
                    gene_summary_data = await o.get_gene_summary_data(self.cf)
                gene_summary_datas[mi.name] = [gene_summary_data, cols]
                for col in cols:
                    if "category" in col and col["category"] in [
                            "single", "multi"
                    ]:
                        for i in range(len(self.colinfo[level]["columns"])):
                            colinfo_col = self.colinfo[level]["columns"][i]
                            if mi.name in ["hg38", "tagsampler"]:
                                grp_name = "base"
                            else:
                                grp_name = mi.name
                            if colinfo_col[
                                    "col_name"] == grp_name + "__" + col[
                                        "name"]:
                                break
                        cats = []
                        for hugo in gene_summary_data:
                            val = gene_summary_data[hugo][col["name"]]
                            if len(colinfo_col["reportsub"]) > 0:
                                if val in colinfo_col["reportsub"]:
                                    val = colinfo_col["reportsub"][val]
                            if val not in cats:
                                cats.append(val)
                        self.colinfo[level]["columns"][i]["col_cats"] = cats
        self.write_preface(level)
        self.extracted_cols[level] = self.get_extracted_header_columns(level)
        self.write_header(level)
        if level == "variant":
            hugo_present = "base__hugo" in self.colnos["variant"]
        datacols, datarows = await self.cf.exec_db(
            self.cf.get_filtered_iterator, level)
        num_total_cols = len(datacols)
        colnos_to_skip = []
        if level == "gene":
            for colno in range(len(datacols)):
                if datacols[colno] in constants.legacy_gene_level_cols_to_skip:
                    colnos_to_skip.append(colno)
        should_skip_some_cols = len(colnos_to_skip) > 0
        if level == "variant" and self.args.separatesample:
            write_variant_sample_separately = True
            sample_newcolno = self.newcolnos["variant"]["base__samples"]
        else:
            write_variant_sample_separately = False
        colnos = self.colnos[level]
        all_mappings_newcolno = self.newcolnos["variant"]["base__all_mappings"]
        cols = self.colinfo[level]["columns"]
        json_colnos = []
        for i in range(len(cols)):
            col = cols[i]
            if col["table"] == True:
                json_colnos.append(i)
        for datarow in datarows:
            if datarow is None:
                continue
            datarow = list(datarow)
            if should_skip_some_cols:
                datarow = [
                    datarow[colno] for colno in range(num_total_cols)
                    if colno not in colnos_to_skip
                ]
            if level == "variant":
                # adds gene level data to variant level.
                if self.nogenelevelonvariantlevel == False and hugo_present:
                    hugo = datarow[self.colnos["variant"]["base__hugo"]]
                    generow = await self.cf.get_gene_row(hugo)
                    if generow is None:
                        datarow.extend(
                            [None for i in range(len(self.var_added_cols))])
                    else:
                        datarow.extend([
                            generow[self.colnos["gene"][colname]]
                            for colname in self.var_added_cols
                        ])
            elif level == "gene":
                # adds summary data to gene level.
                hugo = datarow[0]
                for mi, _, _ in self.summarizing_modules:
                    module_name = mi.name
                    [gene_summary_data, cols] = gene_summary_datas[module_name]
                    if (hugo in gene_summary_data
                            and gene_summary_data[hugo] is not None
                            and len(gene_summary_data[hugo]) == len(cols)):
                        datarow.extend([
                            gene_summary_data[hugo][col["name"]]
                            for col in cols
                        ])
                    else:
                        datarow.extend([None for v in cols])
            # re-orders data row.
            new_datarow = []
            for colname in [
                    col["col_name"] for col in self.colinfo[level]["columns"]
            ]:
                if colname in self.colname_conversion[level]:
                    oldcolname = self.colname_conversion[level][colname]
                    if oldcolname in colnos:
                        colno = colnos[oldcolname]
                    else:
                        self.logger.info(
                            "column name does not exist in data: {}".format(
                                oldcolname))
                        continue
                else:
                    colno = colnos[colname]
                value = datarow[colno]
                new_datarow.append(value)
            # does report substitution.
            new_datarow = self.substitute_val(level, new_datarow)
            if hasattr(
                    self,
                    "keep_json_all_mapping") == False and level == "variant":
                all_map = json.loads(new_datarow[all_mappings_newcolno])
                newvals = []
                for hugo in all_map:
                    for maprow in all_map[hugo]:
                        [protid, protchange, so, transcript,
                         rnachange] = maprow
                        if protid == None:
                            protid = "(na)"
                        if protchange == None:
                            protchange = "(na)"
                        if rnachange == None:
                            rnachange = "(na)"
                        newval = (transcript + ":" + hugo + ":" + protid +
                                  ":" + so + ":" + protchange + ":" +
                                  rnachange)
                        newvals.append(newval)
                newvals.sort()
                newcell = "; ".join(newvals)
                new_datarow[all_mappings_newcolno] = newcell
            if write_variant_sample_separately:
                samples = new_datarow[sample_newcolno]
                if samples is not None:
                    samples = samples.split(";")
                    for sample in samples:
                        sample_datarow = new_datarow
                        sample_datarow[sample_newcolno] = sample
                        self.write_table_row(
                            self.get_extracted_row(sample_datarow))
                else:
                    self.write_table_row(self.get_extracted_row(new_datarow))
            else:
                self.write_table_row(self.get_extracted_row(new_datarow))

    async def store_mapper(self, conn=None, cursor=None):
        """Record the mapper module name found in the ``info`` table.

        Looks up the ``_mapper`` entry and stores the text before the first
        ``:`` of its value in ``self.mapper_name``. ``"hg38"`` is stored when
        the table holds no such entry.

        Args:
            conn: Not used by this method.
            cursor: Cursor exposing awaitable ``execute`` and ``fetchone``.
        """
        await cursor.execute(
            'select colval from info where colkey="_mapper"')
        record = await cursor.fetchone()
        mapper = "hg38" if record is None else record[0].split(":")[0]
        self.mapper_name = mapper

    async def run(self, tab="all"):
        """Generate the report for every result level or for a single one.

        Column information is built first, then ``run_level`` is awaited for
        each reported level. The database is closed and a "Finished" or
        "Failed" status is queued before returning or re-raising.

        Args:
            tab: ``"all"`` to report every level returned by
                ``self.cf.get_result_levels`` whose table exists, otherwise
                the name of the single level to report.

        Returns:
            The value returned by ``self.end()``, or ``None`` when
            ``self.setup()`` returned ``False`` and the run was abandoned.

        Raises:
            Any exception raised while reporting is re-raised after cleanup.
        """
        def logging_enabled():
            return not (hasattr(self, "no_log") and self.no_log)

        def queue_status(verb):
            # A status line needs the module title and a writer, and can be
            # switched off from the command line.
            if self.module_conf is None or self.status_writer is None:
                return
            if self.parsed_args.do_not_change_status == False:
                self.status_writer.queue_status_update(
                    "status",
                    "{} {} ({})".format(verb, self.module_conf["title"],
                                        self.module_name),
                )

        def log_finish():
            end_time = time.time()
            if logging_enabled():
                self.logger.info("finished: {0}".format(
                    time.asctime(time.localtime(end_time))))
                self.logger.info("runtime: {0:0.3f}".format(end_time -
                                                            start_time))

        start_time = time.time()
        try:
            if logging_enabled():
                self.logger.info("started: %s" %
                                 time.asctime(time.localtime(start_time)))
                if self.cf.filter:
                    s = f"filter:\n{yaml.dump(self.filter)}"
                    self.logger.info(s)
            queue_status("Started")
            if self.setup() == False:
                await self.close_db()
                return
            if tab == "all":
                # First pass builds column info for every level, because a
                # level's rows may need another level's columns; the second
                # pass writes the rows.
                for level in await self.cf.exec_db(self.cf.get_result_levels):
                    self.level = level
                    if await self.exec_db(self.table_exists, level):
                        await self.exec_db(self.make_col_info, level)
                for level in await self.cf.exec_db(self.cf.get_result_levels):
                    self.level = level
                    if await self.exec_db(self.table_exists, level):
                        await self.run_level(level)
            else:
                if tab in ["variant", "gene"]:
                    for level in ["variant", "gene"]:
                        if await self.exec_db(self.table_exists, level):
                            await self.exec_db(self.make_col_info, level)
                else:
                    await self.exec_db(self.make_col_info, tab)
                # The reported level is the requested tab. The loop variable
                # above is unbound for other tabs and always ends as "gene".
                self.level = tab
                await self.run_level(tab)
            await self.close_db()
            queue_status("Finished")
            log_finish()
            ret = self.end()
        except BaseException:
            # Cleanup must also run for cancellation and interrupts; the
            # exception is always re-raised.
            await self.close_db()
            queue_status("Failed")
            log_finish()
            raise
        return ret

    async def get_variant_colinfo(self):
        """Build and return column information for the variant and gene levels.

        Calls ``self.setup()`` first, then builds column info for each of the
        two levels whose table exists, variant before gene.

        Returns:
            ``self.colinfo`` as left by ``make_col_info``.
        """
        self.setup()
        for lvl in ("variant", "gene"):
            if await self.exec_db(self.table_exists, lvl):
                await self.exec_db(self.make_col_info, lvl)
        return self.colinfo

    def setup(self):
        pass

    def end(self):
        """Hook invoked by ``run`` once reporting has finished successfully.

        The base implementation does nothing; whatever is returned here
        becomes the return value of ``run``.
        """
        return None

    def write_preface(self, level):
        """Hook receiving a level name; the base implementation does nothing.

        NOTE(review): presumably called once per level before its header is
        written -- the call site is not in this part of the file.
        """
        return None

    def write_header(self, level):
        """Hook receiving a level name; the base implementation does nothing.

        NOTE(review): presumably emits the column header of ``level`` in
        concrete reporters -- the call site is not in this part of the file.
        """
        return None

    def write_table_row(self, row):
        """Hook receiving each report row; the base implementation does nothing.

        Args:
            row: A row as returned by ``get_extracted_row``.
        """
        return None

    def get_extracted_row(self, row):
        """Reduce ``row`` to the columns selected for display at ``self.level``.

        Args:
            row: Sequence of cell values in report column order.

        Returns:
            ``row`` itself when column selection is off for the current
            level, otherwise a new list holding the cells at
            ``self.colnos_to_display[self.level]``, in that order.
        """
        if not self.display_select_columns[self.level]:
            return row
        return [row[idx] for idx in self.colnos_to_display[self.level]]

    def add_conditional_to_colnames_to_display(self, level, column,
                                               module_name):
        """Append a column's name to the display list of ``level`` if it qualifies.

        A non-empty explicit selection for the level
        (``self.extract_columns_multilevel``) decides alone. Otherwise a
        concise report drops columns whose ``col_hidden`` is ``True``, and
        everything else is shown. Columns of the mapper module and of
        ``tagsampler`` are listed under the ``base__`` prefix.

        Args:
            level: Result level whose display list is updated.
            column: Column info dict; ``col_name`` is required and
                ``col_hidden`` is optional.
            module_name: Name of the module the column belongs to.
        """
        col_name = column["col_name"]
        if level in self.extract_columns_multilevel:
            explicit = self.extract_columns_multilevel[level]
        else:
            explicit = ()
        if len(explicit) > 0:
            incl = col_name in explicit
        elif self.concise_report:
            incl = not ("col_hidden" in column
                        and column["col_hidden"] == True)
        else:
            incl = True
        shown = self.colnames_to_display[level]
        if not incl or col_name in shown:
            return
        if module_name == self.mapper_name or module_name == "tagsampler":
            col_name = col_name.replace(module_name + "__", "base__")
        shown.append(col_name)

    async def make_col_info(self, level, conn=None, cursor=None):
        """Build the column layout of one result level from the result database.

        Reads the ``<level>_annotator`` and ``<level>_header`` tables and
        fills, for ``level``: ``self.columngroups``, ``self.colnos`` (column
        name -> position in the raw data row), ``self.colname_conversion``,
        ``self.newcolnos`` (column name -> position after re-ordering),
        ``self.colinfo``, ``self.colnames_to_display``,
        ``self.display_select_columns`` and ``self.colnos_to_display``.
        ``self.column_subs[level]`` is filled when a ``<level>_reportsub``
        table exists. For the variant level, gene-level columns are appended
        (names recorded in ``self.var_added_cols``) unless
        ``self.nogenelevelonvariantlevel`` is set. For the gene level,
        ``self.summarizing_modules`` is rebuilt.

        Args:
            level: Result level name; used to form table names.
            conn: Not used by this method.
            cursor: Cursor exposing awaitable ``execute``, ``fetchone`` and
                ``fetchall``.
        """
        self.colnames_to_display[level] = []
        # self.mapper_name is needed below to fold mapper columns into "base".
        await self.exec_db(self.store_mapper)
        cravat_conf = self.conf.get_cravat_conf()
        if "report_module_order" in cravat_conf:
            priority_colgroupnames = cravat_conf["report_module_order"]
        else:
            priority_colgroupnames = [
                "base", "hg38", "hg19", "hg18", "tagsampler"
            ]
        # Column groups of this level, one per row of <level>_annotator.
        self.columngroups[level] = []
        sql = "select name, displayname from " + level + "_annotator"
        await cursor.execute(sql)
        rows = await cursor.fetchall()
        for row in rows:
            (name, displayname) = row
            self.columngroups[level].append({
                "name": name,
                "displayname": displayname,
                "count": 0
            })
        # Column definitions of this level, stored as JSON in <level>_header.
        header_table = level + "_header"
        coldefs = []
        sql = "select col_def from " + header_table
        await cursor.execute(sql)
        for row in await cursor.fetchall():
            coljson = row[0]
            coldef = ColumnDefinition({})
            coldef.from_json(coljson)
            coldefs.append(coldef)
        columns = []
        self.colnos[level] = {}
        colcount = 0
        # Per-column details: raw position, categories, display flag and
        # per-group column count.
        for coldef in coldefs:
            self.colnos[level][coldef.name] = colcount
            colcount += 1
            # Categorical columns without declared categories get them from
            # the distinct values stored in the level's table.
            if coldef.category in ["single", "multi"] and len(
                    coldef.categories) == 0:
                sql = "select distinct {} from {}".format(coldef.name, level)
                await cursor.execute(sql)
                rs = await cursor.fetchall()
                for r in rs:
                    coldef.categories.append(r[0])
            # Column names have the form "<group>__<column>"; this unpacking
            # raises ValueError for names with any other number of "__".
            [colgrpname, _] = coldef.name.split("__")
            column = coldef.get_colinfo()
            columns.append(column)
            self.add_conditional_to_colnames_to_display(
                level, column, colgrpname)
            for columngroup in self.columngroups[level]:
                if columngroup["name"] == colgrpname:
                    columngroup["count"] += 1
        # Adds gene level columns to the variant level.
        if (self.nogenelevelonvariantlevel == False and level == "variant"
                and await self.exec_db(self.table_exists, "gene")):
            modules_to_add = []
            q = "select name from gene_annotator"
            await cursor.execute(q)
            gene_annotators = [v[0] for v in await cursor.fetchall()]
            modules_to_add = [m for m in gene_annotators if m != "base"]
            for module in modules_to_add:
                cols = []
                q = 'select col_def from gene_header where col_name like "{}__%"'.format(
                    module)
                await cursor.execute(q)
                rs = await cursor.fetchall()
                for r in rs:
                    cd = ColumnDefinition({})
                    cd.from_json(r[0])
                    cols.append(cd)
                q = 'select displayname from gene_annotator where name="{}"'.format(
                    module)
                await cursor.execute(q)
                r = await cursor.fetchone()
                displayname = r[0]
                self.columngroups[level].append({
                    "name": module,
                    "displayname": displayname,
                    "count": len(cols)
                })
                for coldef in cols:
                    # Gene columns are numbered after the level's own columns.
                    self.colnos[level][coldef.name] = colcount
                    colcount += 1
                    # NOTE(review): the category names tested here
                    # ("category"/"multicategory") differ from the
                    # "single"/"multi" pair tested for the level's own
                    # columns above, and the query reads the `level`
                    # (variant) table although the column comes from
                    # gene_header -- confirm both are intended.
                    if (coldef.category in ["category", "multicategory"]
                            and len(coldef.categories) == 0):
                        sql = "select distinct {} from {}".format(
                            coldef.name, level)
                        await cursor.execute(sql)
                        rs = await cursor.fetchall()
                        for r in rs:
                            coldef.categories.append(r[0])
                    column = coldef.get_colinfo()
                    columns.append(column)
                    self.add_conditional_to_colnames_to_display(
                        level, column, module)
                    # NOTE(review): var_added_cols is appended to but not
                    # reset here, so a repeated call would add the names
                    # again -- confirm callers run this once per report.
                    self.var_added_cols.append(coldef.name)
        # Gene level summary columns
        if level == "gene":
            q = "select name from variant_annotator"
            await cursor.execute(q)
            done_var_annotators = [v[0] for v in await cursor.fetchall()]
            self.summarizing_modules = []
            local_modules = au.get_local_module_infos_of_type("annotator")
            local_modules.update(
                au.get_local_module_infos_of_type("postaggregator"))
            summarizer_module_names = []
            for module_name in done_var_annotators:
                # These groups are never summarized by gene.
                if module_name in [
                        "base",
                        "hg19",
                        "hg18",
                        "extra_vcf_info",
                        "extra_variant_info",
                ]:
                    continue
                if module_name not in local_modules:
                    if self.args.silent == False and module_name != 'original_input':
                        print(
                            "            [{}] module does not exist in the system. Gene level summary for this module is skipped."
                            .format(module_name))
                    continue
                module = local_modules[module_name]
                if "can_summarize_by_gene" in module.conf:
                    summarizer_module_names.append(module_name)
            # The mapper is always summarized, and comes first.
            local_modules[self.mapper_name] = au.get_local_module_info(
                self.mapper_name)
            summarizer_module_names = [self.mapper_name
                                       ] + summarizer_module_names
            for module_name in summarizer_module_names:
                mi = local_modules[module_name]
                # Makes the module's own directory importable.
                # NOTE(review): sys.path gains an entry on every call.
                sys.path = sys.path + [os.path.dirname(mi.script_path)]
                if module_name in done_var_annotators:
                    annot_cls = util.load_class(mi.script_path,
                                                "CravatAnnotator")
                elif module_name == self.mapper_name:
                    annot_cls = util.load_class(mi.script_path, "Mapper")
                cmd = {
                    "script_path": mi.script_path,
                    "input_file": "__dummy__",
                    "output_dir": self.output_dir,
                }
                annot = annot_cls(cmd)
                cols = mi.conf["gene_summary_output_columns"]
                columngroup = {
                    "name": mi.name,
                    "displayname": mi.title,
                    "count": len(cols),
                }
                self.columngroups[level].append(columngroup)
                for col in cols:
                    coldef = ColumnDefinition(col)
                    coldef.name = columngroup["name"] + "__" + coldef.name
                    coldef.genesummary = True
                    column = coldef.get_colinfo()
                    columns.append(column)
                    self.add_conditional_to_colnames_to_display(
                        level, column, mi.name)
                self.summarizing_modules.append([mi, annot, cols])
                # Summary columns are numbered after everything registered so
                # far for this level.
                for col in cols:
                    fullname = module_name + "__" + col["name"]
                    self.colnos[level][fullname] = len(self.colnos[level])
        # Re-orders column groups: priority groups first, in priority order.
        colgrps = self.columngroups[level]
        newcolgrps = []
        for priority_colgrpname in priority_colgroupnames:
            for colgrp in colgrps:
                if colgrp["name"] == priority_colgrpname:
                    if colgrp["name"] in [self.mapper_name, "tagsampler"]:
                        # Mapper and tagsampler columns are counted in the
                        # first group already placed.
                        # NOTE(review): raises IndexError when no group has
                        # been placed yet -- presumably "base" always
                        # precedes them in the priority list; confirm.
                        newcolgrps[0]["count"] += colgrp["count"]
                    else:
                        newcolgrps.append(colgrp)
                    break
        # "lastcol" is the running end position of each group.
        colpos = 0
        for colgrp in newcolgrps:
            colgrp["lastcol"] = colpos + colgrp["count"]
            colpos = colgrp["lastcol"]
        # Remaining groups follow, sorted by display name.
        colgrpnames = [
            v["displayname"] for v in colgrps
            if v["name"] not in priority_colgroupnames
        ]
        colgrpnames.sort()
        for colgrpname in colgrpnames:
            for colgrp in colgrps:
                if colgrp["displayname"] == colgrpname:
                    colgrp["lastcol"] = colpos + colgrp["count"]
                    newcolgrps.append(colgrp)
                    colpos += colgrp["count"]
                    break
        # Re-orders columns to follow the new group order.
        self.colname_conversion[level] = {}
        new_columns = []
        self.newcolnos[level] = {}
        newcolno = 0
        new_colnames_to_display = []
        for colgrp in newcolgrps:
            colgrpname = colgrp["name"]
            for col in columns:
                colname = col["col_name"]
                [grpname, _] = colname.split("__")
                if colgrpname == "base" and grpname in [
                        self.mapper_name, "tagsampler"
                ]:
                    # Mapper/tagsampler columns are renamed into the base
                    # group; colname_conversion maps the new name back to
                    # the original one.
                    newcolname = "base__" + colname.split("__")[1]
                    self.colname_conversion[level][newcolname] = colname
                    col["col_name"] = newcolname
                    new_columns.append(col)
                    self.newcolnos[level][newcolname] = newcolno
                    if newcolname in self.colnames_to_display[level]:
                        new_colnames_to_display.append(newcolname)
                elif grpname == colgrpname:
                    new_columns.append(col)
                    self.newcolnos[level][colname] = newcolno
                    if colname in self.colnames_to_display[level]:
                        new_colnames_to_display.append(colname)
                else:
                    continue
                newcolno += 1
        self.colinfo[level] = {"colgroups": newcolgrps, "columns": new_columns}
        self.colnames_to_display[level] = new_colnames_to_display
        # Report substitution: per-column value replacements read from
        # <level>_reportsub (first field module name, second field JSON).
        if level in ["variant", "gene"]:
            reportsubtable = level + "_reportsub"
            if await self.exec_db(self.table_exists, reportsubtable):
                q = "select * from {}".format(reportsubtable)
                await cursor.execute(q)
                reportsub = {
                    r[0]: json.loads(r[1])
                    for r in await cursor.fetchall()
                }
                self.column_subs[level] = []
                for i, column in enumerate(new_columns):
                    module, col = column["col_name"].split("__")
                    if module == self.mapper_name:
                        module = "base"
                    if module in reportsub and col in reportsub[module]:
                        self.column_subs[level].append(
                            SimpleNamespace(
                                module=module,
                                col=col,
                                index=i,
                                subs=reportsub[module][col],
                            ))
                        new_columns[i]["reportsub"] = reportsub[module][col]
        # Columns are filtered on output only when an explicit selection
        # exists for this level or a concise report was requested.
        if (level in self.extract_columns_multilevel
                and len(self.extract_columns_multilevel[level]) > 0
            ) or self.concise_report:
            self.display_select_columns[level] = True
        else:
            self.display_select_columns[level] = False
        # Positions (in re-ordered numbering) of the columns to display.
        colno = 0
        self.colnos_to_display[level] = []
        for colgroup in self.colinfo[level]["colgroups"]:
            count = colgroup["count"]
            if count == 0:
                continue
            for col in self.colinfo[level]["columns"][colno:colno + count]:
                module_col_name = col["col_name"]
                if module_col_name in self.colnames_to_display[level]:
                    include_col = True
                else:
                    include_col = False
                if include_col:
                    self.colnos_to_display[level].append(colno)
                colno += 1

    def get_standardized_module_option(self, v):
        """Convert a module option given as text into a structured value.

        * A string containing ``:`` becomes a dict. The string is split on
          ``.``; each piece of the form ``key:a,b`` adds ``key -> ["a", "b"]``
          and any other piece is ignored.
        * Otherwise a string containing ``,`` becomes a list of its non-empty
          comma-separated items.
        * ``"true"`` and ``"false"`` become the corresponding booleans.
        * Anything else is returned unchanged.

        Args:
            v: Option value of any type.

        Returns:
            The converted value.
        """
        if type(v) is str:
            if ":" in v:
                parsed = {}
                for piece in v.split("."):
                    parts = piece.split(":")
                    # Exactly one ":" is required; anything else is dropped.
                    if len(parts) == 2:
                        parsed[parts[0]] = parts[1].split(",")
                v = parsed
            elif "," in v:
                v = [item for item in v.split(",") if item != ""]
        if v == "true":
            return True
        if v == "false":
            return False
        return v

    async def connect_db(self, dbpath=None):
        """Validate the path of the result database.

        Despite its name, this method opens no connection: it only records
        the path and checks that it exists.

        Args:
            dbpath: Path to store in ``self.dbpath``; when ``None`` the value
                already in ``self.dbpath`` is checked instead.

        Raises:
            SystemExit: After a message on stderr, when no path is available
                or the path does not exist.
        """
        if dbpath is not None:
            self.dbpath = dbpath
        if self.dbpath is None:
            sys.stderr.write("Provide a path to aggregator output")
            # sys.exit rather than the bare exit() helper, which is injected
            # by the site module and is not guaranteed to be defined.
            sys.exit()
        if not os.path.exists(self.dbpath):
            sys.stderr.write(self.dbpath + " does not exist.")
            sys.exit()

    async def close_db(self):
        """Close the report's connection and its filter, if they are open.

        ``self.conn`` may be missing altogether; ``self.cf`` must exist.
        Each closed handle is reset to ``None``, so calling this again is
        harmless.
        """
        connection = getattr(self, "conn", None)
        if connection is not None:
            await connection.close()
            self.conn = None
        result_filter = self.cf
        if result_filter is not None:
            await result_filter.close_db()
            self.cf = None

    async def load_filter(self):
        """Create the result filter for ``self.dbpath`` and load its definition.

        The new ``CravatFilter`` is stored in ``self.cf``; its ``loadfilter``
        is then run through its own ``exec_db`` with the filter settings held
        on this report and the sample include/exclude command-line arguments.
        """
        self.cf = await CravatFilter.create(dbpath=self.dbpath)
        result_filter = self.cf
        run_on_db = result_filter.exec_db
        loader = result_filter.loadfilter
        settings = dict(
            filter=self.filter,
            filterpath=self.filterpath,
            filtername=self.filtername,
            filterstring=self.filterstring,
            filtersql=self.filtersql,
            includesample=self.args.includesample,
            excludesample=self.args.excludesample,
        )
        await run_on_db(loader, **settings)

    async def table_exists(self, tablename, conn=None, cursor=None):
        """Tell whether the database has a table named ``tablename``.

        Args:
            tablename: Table name; inserted verbatim into the query text.
            conn: Not used by this method.
            cursor: Cursor exposing awaitable ``execute`` and ``fetchone``.

        Returns:
            ``True`` when ``sqlite_master`` lists such a table, else ``False``.
        """
        query = "".join((
            "select name from sqlite_master where ",
            'type="table" and name="',
            tablename,
            '"',
        ))
        await cursor.execute(query)
        found = await cursor.fetchone()
        return found is not None
Ejemplo n.º 17
0
import glob
import platform
import signal
import multiprocessing as mp
import asyncio
import importlib
from multiprocessing import Process, Pipe, Value, Manager, Queue
from queue import Empty
from cravat import constants
from cravat import get_live_annotator, get_live_mapper
import signal
import gzip
from cravat.cravat_util import max_version_supported_for_migration
import cravat.util

# Module-level configuration loader, instantiated once at import time.
cfl = ConfigLoader()


class FileRouter:
    """Holds the naming conventions used for job files."""

    def __init__(self):
        # Directory containing this module's file.
        self.root = os.path.dirname(__file__)
        self.input_fname = 'input'
        # File suffixes of the database, log and status files.
        self.db_extension = '.sqlite'
        self.log_extension = '.log'
        self.status_extension = '.status.json'
        # Report type -> file suffix of the generated report.
        self.report_extensions = dict(text='.tsv', excel='.xlsx', vcf='.vcf')
        # Starts empty.
        self.job_statuses = dict()
Ejemplo n.º 18
0
 def setup(self):
     """Initialise per-run state: an empty data store and a config loader.

     ``keep_json_all_mapping`` is set as well; the row-processing code in
     this file only tests for the attribute's presence and, when it is
     present, leaves the ``all_mappings`` cell as JSON.
     """
     self.data = dict()
     self.keep_json_all_mapping = True
     self.conf = ConfigLoader()
Ejemplo n.º 19
0
class CravatReport:
    def __init__(self, cmd_args, status_writer=None):
        """Parse the command line and initialise empty report state.

        Args:
            cmd_args: Command-line arguments, passed to ``parse_cmd_args``
                together with the module-level ``parser``.
            status_writer: Optional object used to queue status updates.
        """
        self.status_writer = status_writer
        global parser
        # Drops the "-t" option from the shared module-level parser.
        # NOTE(review): Python 3.10 changed argparse's default title for
        # this group from "optional arguments" to "options", so this match
        # may never succeed on newer interpreters -- confirm the intent.
        for ag in parser._action_groups:
            if ag.title == 'optional arguments':
                # Iterate over a snapshot: removing from the list being
                # iterated would skip the entry after each removal.
                for a in list(ag._actions):
                    if '-t' in a.option_strings:
                        ag._actions.remove(a)
        self.parse_cmd_args(parser, cmd_args)
        self.cursor = None
        self.cf = None
        self.filtertable = 'filter'
        # Per-level column bookkeeping, keyed by level name.
        self.colinfo = {}
        self.colnos = {}
        self.newcolnos = {}
        self.var_added_cols = []
        self.summarizing_modules = []
        self.columngroups = {}
        self.column_subs = {}
        self.column_sub_allow_partial_match = {}
        self.colname_conversion = {}
        self._setup_logger()
        self.warning_msgs = []

    def parse_cmd_args(self, parser, cmd_args):
        """Parse ``cmd_args`` and copy the resulting settings onto the report.

        Besides the plain copies of parsed options, this derives
        ``output_dir`` (defaults to the directory of the database),
        ``savepath`` (a bare file name is placed in ``output_dir``),
        ``output_basename`` (database file name minus its last seven
        characters, presumably the ``.sqlite`` suffix), ``status_fpath``,
        and loads the job and module configuration. When no input files were
        given, they are read from the ``_input_paths`` entry of the
        database's ``info`` table.

        Args:
            parser: Argument parser providing ``parse_args``.
            cmd_args: Raw command-line arguments.
        """
        cmd_args = clean_args(cmd_args)
        parsed_args = parser.parse_args(cmd_args)
        self.parsed_args = parsed_args
        self.dbpath = parsed_args.dbpath
        self.filterpath = parsed_args.filterpath
        self.filtername = parsed_args.filtername
        self.filterstring = parsed_args.filterstring
        self.confs = None
        # Always defined, so later reads of self.filter cannot fail when no
        # --confs value was given.
        self.filter = None
        if parsed_args.confs is not None:
            # The value arrives as a single-quoted pseudo-JSON string.
            confs = parsed_args.confs.strip("'").replace("'", '"')
            self.confs = json.loads(confs)
            if 'filter' in self.confs:
                self.filter = self.confs['filter']
        if parsed_args.output_dir is not None:
            self.output_dir = parsed_args.output_dir
        else:
            self.output_dir = os.path.dirname(self.dbpath)
        self.savepath = parsed_args.savepath
        if self.savepath is not None and os.path.dirname(self.savepath) == '':
            self.savepath = os.path.join(self.output_dir, self.savepath)
        self.confpath = parsed_args.confpath
        self.conf = ConfigLoader(job_conf_path=self.confpath)
        self.module_name = parsed_args.module_name
        if self.conf is not None:
            self.module_conf = self.conf.get_module_conf(self.module_name)
        else:
            self.module_conf = None
        if hasattr(parsed_args, 'reporttypes'):
            self.report_types = parsed_args.reporttypes
        self.output_basename = os.path.basename(self.dbpath)[:-7]
        status_fname = '{}.status.json'.format(self.output_basename)
        self.status_fpath = os.path.join(self.output_dir, status_fname)
        self.nogenelevelonvariantlevel = parsed_args.nogenelevelonvariantlevel
        if parsed_args.inputfiles is None and parsed_args.dbpath is not None:
            db = sqlite3.connect(parsed_args.dbpath)
            try:
                c = db.cursor()
                c.execute(
                    'select colval from info where colkey="_input_paths"')
                r = c.fetchone()
            finally:
                # The connection is only needed for this one lookup.
                db.close()
            if r is not None:
                parsed_args.inputfiles = []
                # Stored as a single-quoted dict literal; made JSON first.
                s = json.loads(r[0].replace("'", '"'))
                for k in s:
                    parsed_args.inputfiles.append(s[k])
        self.args = parsed_args

    async def prep(self):
        """Await ``connect_db`` and then ``load_filter``, in that order."""
        await self.connect_db()
        await self.load_filter()

    def _setup_logger(self):
        if hasattr(self, 'no_log') and self.no_log:
            return
        try:
            self.logger = logging.getLogger('cravat.' + self.module_name)
        except Exception as e:
            self._log_exception(e)
        self.error_logger = logging.getLogger('error.' + self.module_name)
        self.unique_excs = []

    def _log_exception(self, e, halt=True):
        if halt:
            raise e
        else:
            if self.logger:
                self.logger.exception(e)

    async def getjson(self, level):
        """Return the first filtered row of ``level`` as a JSON string.

        Args:
            level: Result level to read.

        Returns:
            The JSON text of the first row yielded by the filter's iterator,
            after report substitution, or ``None`` when the level's table
            does not exist or no row is yielded.
        """
        exists = await self.table_exists(level)
        if exists == False:
            return None
        rows = await self.cf.getiterator(level)
        for first in rows:
            # Only the first row is ever serialized.
            return json.dumps(self.substitute_val(level, first))
        return None

    def substitute_val(self, level, row):
        """Apply the report substitutions of ``level`` to ``row`` in place.

        For every column number registered in ``self.column_subs[level]``, a
        non-``None`` cell is either looked up as a whole in the column's
        mapping, or -- when partial matching is enabled for that column --
        passed through ``target.sub(substitution, value)`` for each entry of
        the mapping.

        Args:
            level: Result level the row belongs to.
            row: Mutable sequence of cell values.

        Returns:
            The same ``row`` object.
        """
        if level not in self.column_subs:
            return row
        subs_by_colno = self.column_subs[level]
        partial_by_colno = self.column_sub_allow_partial_match[level]
        for colno, mapping in subs_by_colno.items():
            cell = row[colno]
            if cell is None:
                continue
            if partial_by_colno[colno]:
                for pattern, replacement in mapping.items():
                    cell = pattern.sub(replacement, cell)
            elif cell in mapping:
                cell = mapping[cell]
            row[colno] = cell
        return row

    def process_datarow(self, args):
        """Turn one raw data row into a re-ordered, substituted report row.

        Args:
            args: Sequence of ``(datarow, should_skip_some_cols, level,
                gene_summary_datas)``. ``gene_summary_datas`` maps a module
                name to a ``[summary data, columns]`` pair.

        Returns:
            The new row as a list, or ``None`` when ``datarow`` is ``None``.

        NOTE(review): ``num_total_cols``, ``colnos_to_skip`` and
        ``hugo_present`` are not defined anywhere in this method; unless they
        exist as module globals, the branches using them raise ``NameError``.
        """
        datarow = args[0]
        should_skip_some_cols = args[1]
        level = args[2]
        gene_summary_datas = args[3]
        if datarow is None:
            return None
        datarow = list(datarow)
        if should_skip_some_cols:
            datarow = [
                datarow[colno] for colno in range(num_total_cols)
                if colno not in colnos_to_skip
            ]
        if level == 'variant':
            # adds gene level data to variant level.
            if self.nogenelevelonvariantlevel == False and hugo_present:
                hugo = datarow[self.colnos['variant']['base__hugo']]
                # NOTE(review): asyncio.ensure_future accepts ``loop`` only
                # as a keyword argument, and result() is called without
                # waiting for the future to complete -- this looks like it
                # cannot work as written; confirm against the asyncio docs.
                loop = asyncio.get_event_loop()
                future = asyncio.ensure_future(self.cf.get_gene_row(hugo),
                                               loop)
                generow = future.result()
                if generow is None:
                    # No gene row: pad so later column positions still line up.
                    datarow.extend(
                        [None for i in range(len(self.var_added_cols))])
                else:
                    datarow.extend([
                        generow[self.colnos['gene'][colname]]
                        for colname in self.var_added_cols
                    ])
        elif level == 'gene':
            # adds summary data to gene level.
            hugo = datarow[0]
            for mi, _, _ in self.summarizing_modules:
                module_name = mi.name
                [gene_summary_data, cols] = gene_summary_datas[module_name]
                # A summary is used only when it is present, not None and
                # has exactly one entry per summary column.
                if hugo in gene_summary_data and gene_summary_data[
                        hugo] is not None and len(
                            gene_summary_data[hugo]) == len(cols):
                    datarow.extend(
                        [gene_summary_data[hugo][col['name']] for col in cols])
                else:
                    datarow.extend([None for v in cols])
        # re-orders data row.
        new_datarow = []
        colnos = self.colnos[level]
        for colname in [
                col['col_name'] for col in self.colinfo[level]['columns']
        ]:
            # Renamed columns are looked up under the name stored in
            # colname_conversion.
            if colname in self.colname_conversion[level]:
                newcolname = self.colname_conversion[level][colname]
                if newcolname in colnos:
                    colno = colnos[newcolname]
                else:
                    self.logger.info(
                        'column name does not exist in data: {}'.format(
                            colname))
                    continue
            else:
                colno = colnos[colname]
            value = datarow[colno]
            new_datarow.append(value)
        # does report substitution.
        new_datarow = self.substitute_val(level, new_datarow)
        # Unless keep_json_all_mapping is set on the instance, the JSON in
        # the all_mappings cell is flattened into sorted
        # "transcript:hugo:protid:so:protchange:rnachange" items joined by "; ".
        if hasattr(self,
                   'keep_json_all_mapping') == False and level == 'variant':
            # NOTE(review): this index comes from self.colnos (raw-row
            # numbering) although new_datarow is in re-ordered column order
            # -- confirm the two numberings agree for this column.
            colno = self.colnos['variant']['base__all_mappings']
            all_map = json.loads(new_datarow[colno])
            newvals = []
            for hugo in all_map:
                for maprow in all_map[hugo]:
                    [protid, protchange, so, transcript, rnachange] = maprow
                    if protid == None:
                        protid = '(na)'
                    if protchange == None:
                        protchange = '(na)'
                    if rnachange == None:
                        rnachange = '(na)'
                    newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange
                    newvals.append(newval)
            newvals.sort()
            newcell = '; '.join(newvals)
            new_datarow[colno] = newcell
        return new_datarow

    async def run_level(self, level):
        """Write the report table for one result level.

        Returns immediately when the database has no table named ``level``.
        Otherwise the filtered table for the level is built, ``write_preface``
        and ``write_header`` are called, and every filtered row is processed:
        legacy gene columns are dropped (gene level), gene-level data is
        appended (variant level) or gene summary data is appended (gene
        level), cells are re-ordered to follow
        ``self.colinfo[level]['columns']``, report substitution is applied and
        the row is passed to ``write_table_row``.

        Reads ``self.colnos``, ``self.newcolnos``, ``self.colinfo``,
        ``self.colname_conversion``, ``self.summarizing_modules`` and
        ``self.var_added_cols``, which ``make_col_info`` fills in.

        Args:
            level: Name of the result level / table (e.g. ``'variant'`` or
                ``'gene'``).
        """
        ret = await self.table_exists(level)
        if ret == False:
            return
        # Maps module name -> [summary data, summary column definitions];
        # only filled in for the gene level.
        gene_summary_datas = {}
        if level == 'variant':
            await self.cf.make_filtered_uid_table()
        elif level == 'gene':
            await self.cf.make_filtered_hugo_table()
            for mi, o, cols in self.summarizing_modules:
                # Modules that still expose build_gene_collection are treated
                # as obsolete: a warning is recorded and printed, and they
                # contribute no summary data.
                if hasattr(o, 'build_gene_collection'):
                    msg = 'Obsolete module [{}] for gene level summarization. Update the module to get correct gene level summarization.'.format(
                        mi.name)
                    self.warning_msgs.append(msg)
                    print('===Warning: {}'.format(msg))
                    gene_summary_data = {}
                else:
                    gene_summary_data = await o.get_gene_summary_data(self.cf)
                gene_summary_datas[mi.name] = [gene_summary_data, cols]
                # For categorical summary columns, collect the distinct
                # (report-substituted) values and store them as 'col_cats' on
                # the matching column info entry.
                for col in cols:
                    if 'category' in col and col['category'] in [
                            'single', 'multi'
                    ]:
                        # Locate the column info entry for this summary
                        # column; 'hg38' and 'tagsampler' columns are looked
                        # up under the 'base' group name.
                        for i in range(len(self.colinfo[level]['columns'])):
                            colinfo_col = self.colinfo[level]['columns'][i]
                            if mi.name in ['hg38', 'tagsampler']:
                                grp_name = 'base'
                            else:
                                grp_name = mi.name
                            if colinfo_col[
                                    'col_name'] == grp_name + '__' + col[
                                        'name']:
                                break
                        # NOTE(review): there is no for/else guard, so if no
                        # column matched, `i` and `colinfo_col` refer to the
                        # last column of the level.
                        cats = []
                        for hugo in gene_summary_data:
                            val = gene_summary_data[hugo][col['name']]
                            if len(colinfo_col['reportsub']) > 0:
                                if val in colinfo_col['reportsub']:
                                    val = colinfo_col['reportsub'][val]
                            if val not in cats:
                                cats.append(val)
                        self.colinfo[level]['columns'][i]['col_cats'] = cats
        self.write_preface(level)
        self.write_header(level)
        # hugo_present is only bound for the variant level; it is only read
        # inside the variant branch of the row loop below.
        if level == 'variant':
            hugo_present = 'base__hugo' in self.colnos['variant']
        datacols, datarows = await self.cf.get_filtered_iterator(level)
        num_total_cols = len(datacols)
        # Positions of legacy gene-level columns to drop from each row.
        colnos_to_skip = []
        if level == 'gene':
            for colno in range(len(datacols)):
                if datacols[colno] in constants.legacy_gene_level_cols_to_skip:
                    colnos_to_skip.append(colno)
        should_skip_some_cols = len(colnos_to_skip) > 0
        # With --separatesample style output, one row is written per sample
        # of a variant instead of one row per variant.
        if level == 'variant' and self.args.separatesample:
            write_variant_sample_separately = True
            sample_newcolno = self.newcolnos['variant']['base__samples']
        else:
            write_variant_sample_separately = False
        colnos = self.colnos[level]
        # NOTE(review): this local is not referenced again in this method.
        newcolnos = self.newcolnos[level]
        # NOTE(review): looked up regardless of `level`, so variant column
        # info with a 'base__all_mappings' column must exist even when another
        # level is being written.
        all_mappings_newcolno = self.newcolnos['variant']['base__all_mappings']
        for datarow in datarows:
            if datarow is None:
                continue
            datarow = list(datarow)
            if should_skip_some_cols:
                datarow = [
                    datarow[colno] for colno in range(num_total_cols)
                    if colno not in colnos_to_skip
                ]
            if level == 'variant':
                # adds gene level data to variant level.
                if self.nogenelevelonvariantlevel == False and hugo_present:
                    hugo = datarow[self.colnos['variant']['base__hugo']]
                    generow = await self.cf.get_gene_row(hugo)
                    # Pad with None when the gene has no row so that the
                    # number of cells stays the same for every variant.
                    if generow is None:
                        datarow.extend(
                            [None for i in range(len(self.var_added_cols))])
                    else:
                        datarow.extend([
                            generow[self.colnos['gene'][colname]]
                            for colname in self.var_added_cols
                        ])
            elif level == 'gene':
                # adds summary data to gene level.
                # The first cell of a gene row is used as the gene symbol.
                hugo = datarow[0]
                for mi, _, _ in self.summarizing_modules:
                    module_name = mi.name
                    [gene_summary_data, cols] = gene_summary_datas[module_name]
                    # Summary values are used only when the gene has an entry
                    # whose size equals the number of summary columns;
                    # otherwise the cells are filled with None.
                    if hugo in gene_summary_data and gene_summary_data[
                            hugo] is not None and len(
                                gene_summary_data[hugo]) == len(cols):
                        datarow.extend([
                            gene_summary_data[hugo][col['name']]
                            for col in cols
                        ])
                    else:
                        datarow.extend([None for v in cols])
            # re-orders data row.
            # Cells are emitted in report column order. Renamed columns are
            # mapped back to their original names through colname_conversion
            # to find their position in the raw row; a renamed column missing
            # from the data is logged and left out of the row.
            new_datarow = []
            for colname in [
                    col['col_name'] for col in self.colinfo[level]['columns']
            ]:
                if colname in self.colname_conversion[level]:
                    oldcolname = self.colname_conversion[level][colname]
                    if oldcolname in colnos:
                        colno = colnos[oldcolname]
                    else:
                        self.logger.info(
                            'column name does not exist in data: {}'.format(
                                oldcolname))
                        continue
                else:
                    colno = colnos[colname]
                value = datarow[colno]
                new_datarow.append(value)
            # does report substitution.
            new_datarow = self.substitute_val(level, new_datarow)
            # Unless the instance has a `keep_json_all_mapping` attribute
            # (only its presence is tested, not its value), the JSON in the
            # all-mappings cell is flattened into a sorted, '; '-joined list
            # of 'transcript:hugo:protid:so:protchange:rnachange' strings,
            # with '(na)' standing in for missing values.
            if hasattr(
                    self,
                    'keep_json_all_mapping') == False and level == 'variant':
                all_map = json.loads(new_datarow[all_mappings_newcolno])
                newvals = []
                for hugo in all_map:
                    for maprow in all_map[hugo]:
                        [protid, protchange, so, transcript,
                         rnachange] = maprow
                        if protid == None:
                            protid = '(na)'
                        if protchange == None:
                            protchange = '(na)'
                        if rnachange == None:
                            rnachange = '(na)'
                        newval = transcript + ':' + hugo + ':' + protid + ':' + so + ':' + protchange + ':' + rnachange
                        newvals.append(newval)
                newvals.sort()
                newcell = '; '.join(newvals)
                new_datarow[all_mappings_newcolno] = newcell
            if write_variant_sample_separately:
                samples = new_datarow[sample_newcolno]
                if samples is not None:
                    samples = samples.split(';')
                    for sample in samples:
                        # NOTE(review): this is an alias, not a copy; every
                        # write_table_row call receives the same list object
                        # with only the sample cell overwritten.
                        sample_datarow = new_datarow
                        sample_datarow[sample_newcolno] = sample
                        self.write_table_row(sample_datarow)
                else:
                    self.write_table_row(new_datarow)
            else:
                self.write_table_row(new_datarow)

    async def store_mapper(self):
        """Record the mapper module name in ``self.mapper_name``.

        Reads the ``_mapper`` entry of the ``info`` table and keeps the part
        of its value before the first ``':'``. When the entry is missing the
        name defaults to ``'hg38'``.
        """
        await self.cursor.execute(
            'select colval from info where colkey="_mapper"')
        record = await self.cursor.fetchone()
        if record is not None:
            self.mapper_name = record[0].partition(':')[0]
            return
        self.mapper_name = 'hg38'

    async def run(self, tab='all'):
        """Run the report for every result level or for a single one.

        With ``tab='all'`` column info is built for every existing level and
        then every existing level is written. With ``tab`` set to
        ``'variant'`` or ``'gene'`` column info is built for both of those
        levels (when their tables exist) before ``tab`` is written. Any other
        value builds column info for, and writes, just that level.

        Start/finish status updates are queued when both ``module_conf`` and
        ``status_writer`` are set and status changes are not disabled; timing
        is logged unless ``no_log`` is set to a true value.

        Args:
            tab: ``'all'`` or the name of a single level.

        Returns:
            ``None`` when ``setup()`` returns ``False``; otherwise whatever
            ``end()`` returns.
        """

        def log_enabled():
            # Checked afresh at each use rather than cached.
            return not getattr(self, 'no_log', False)

        def post_status(template):
            if self.module_conf is None or self.status_writer is None:
                return
            if self.parsed_args.do_not_change_status == False:
                self.status_writer.queue_status_update(
                    'status',
                    template.format(self.module_conf['title'],
                                    self.module_name))

        started_at = time.time()
        if log_enabled():
            self.logger.info('started: %s' %
                             time.asctime(time.localtime(started_at)))
        post_status('Started {} ({})')
        if self.setup() == False:
            return
        if tab == 'all':
            # First pass builds column info for every existing level; the
            # second pass writes them.
            for step in (self.make_col_info, self.run_level):
                for level in await self.cf.get_result_levels():
                    if await self.table_exists(level):
                        await step(level)
        elif tab in ('variant', 'gene'):
            for level in ('variant', 'gene'):
                if await self.table_exists(level):
                    await self.make_col_info(level)
            await self.run_level(tab)
        else:
            await self.make_col_info(tab)
            await self.run_level(tab)
        post_status('Finished {} ({})')
        finished_at = time.time()
        if log_enabled():
            self.logger.info('finished: {0}'.format(
                time.asctime(time.localtime(finished_at))))
            elapsed = finished_at - started_at
            self.logger.info('runtime: {0:0.3f}'.format(elapsed))
        return self.end()

    async def get_variant_colinfo(self):
        """Build column info for the variant and gene levels and return it.

        ``setup()`` is called first; ``make_col_info`` then runs for each of
        the two levels whose table exists in the database.

        Returns:
            ``self.colinfo``.
        """
        self.setup()
        for lvl in ('variant', 'gene'):
            table_present = await self.table_exists(lvl)
            if table_present:
                await self.make_col_info(lvl)
        return self.colinfo

    def setup(self):
        pass

    def end(self):
        """Hook invoked at the end of ``run()``, which returns its result.

        The base implementation does nothing and returns ``None``.
        """
        return None

    def write_preface(self, level):
        """Hook invoked by ``run_level`` before the header of ``level``.

        The base implementation does nothing and returns ``None``.
        """
        return None

    def write_header(self, level):
        """Hook invoked by ``run_level`` before any row of ``level``.

        The base implementation does nothing and returns ``None``.
        """
        return None

    def write_table_row(self, row):
        """Hook invoked by ``run_level`` with each finished row (a list).

        The base implementation does nothing and returns ``None``.
        """
        return None

    async def make_col_info(self, level):
        """Build column and column-group metadata for one result level.

        Reads the ``<level>_annotator`` and ``<level>_header`` tables and
        fills in, for ``level``: ``self.columngroups``, ``self.colnos``
        (column name -> position in a raw data row), ``self.newcolnos``
        (column name -> position after re-ordering),
        ``self.colname_conversion`` (renamed column -> original name) and
        ``self.colinfo`` (``{'colgroups': ..., 'columns': ...}``).

        For the variant level, gene-level annotator columns are appended and
        their names added to ``self.var_added_cols`` (unless
        ``self.nogenelevelonvariantlevel`` is set or there is no gene table).
        For the gene level, ``self.summarizing_modules`` is rebuilt and the
        gene summary columns are appended. When a ``<level>_reportsub`` table
        exists, ``self.report_substitution``, ``self.column_subs[level]`` and
        ``self.column_sub_allow_partial_match[level]`` are set as well.

        The per-level dictionaries and ``self.var_added_cols`` must already
        exist on the instance; this method only indexes into / appends to
        them.

        Args:
            level: Name of the result level (table name prefix).
        """
        await self.store_mapper()
        cravat_conf = self.conf.get_cravat_conf()
        # Column groups named here are placed first, in this order; a
        # configured 'report_module_order' replaces the built-in default.
        if 'report_module_order' in cravat_conf:
            priority_colgroupnames = cravat_conf['report_module_order']
        else:
            priority_colgroupnames = [
                'base', 'hg38', 'hg19', 'hg18', 'tagsampler'
            ]
        # level-specific column groups
        self.columngroups[level] = []
        sql = 'select name, displayname from ' + level + '_annotator'
        await self.cursor.execute(sql)
        rows = await self.cursor.fetchall()
        for row in rows:
            (name, displayname) = row
            self.columngroups[level].append({
                'name': name,
                'displayname': displayname,
                'count': 0
            })
        # level-specific column names
        header_table = level + '_header'
        coldefs = []
        sql = 'select col_def from ' + header_table
        await self.cursor.execute(sql)
        for row in await self.cursor.fetchall():
            coljson = row[0]
            coldef = ColumnDefinition({})
            coldef.from_json(coljson)
            coldefs.append(coldef)
        columns = []
        self.colnos[level] = {}
        # Running position of the next column in a raw data row.
        colcount = 0
        # level-specific column details
        for coldef in coldefs:
            self.colnos[level][coldef.name] = colcount
            colcount += 1
            # Categorical columns that declare no categories get them from the
            # distinct values stored in the level's table.
            if coldef.category in ['single', 'multi'] and len(
                    coldef.categories) == 0:
                sql = 'select distinct {} from {}'.format(coldef.name, level)
                await self.cursor.execute(sql)
                rs = await self.cursor.fetchall()
                for r in rs:
                    coldef.categories.append(r[0])
            # Column names must contain exactly one '__' (group__column) for
            # this unpacking to succeed; only the group part is used below.
            [colgrpname, colonlyname] = coldef.name.split('__')
            column = coldef.get_colinfo()
            columns.append(column)
            for columngroup in self.columngroups[level]:
                if columngroup['name'] == colgrpname:
                    columngroup['count'] += 1
        # adds gene level columns to variant level.
        if self.nogenelevelonvariantlevel == False and level == 'variant' and await self.table_exists(
                'gene'):
            modules_to_add = []
            q = 'select name from gene_annotator'
            await self.cursor.execute(q)
            gene_annotators = [v[0] for v in await self.cursor.fetchall()]
            modules_to_add = [m for m in gene_annotators if m != 'base']
            for module in modules_to_add:
                # modules_to_add is derived from gene_annotators, so this
                # check never skips anything.
                if not module in gene_annotators:
                    continue
                cols = []
                q = 'select col_def from gene_header where col_name like "{}__%"'.format(
                    module)
                await self.cursor.execute(q)
                rs = await self.cursor.fetchall()
                for r in rs:
                    cd = ColumnDefinition({})
                    cd.from_json(r[0])
                    cols.append(cd)
                q = 'select displayname from gene_annotator where name="{}"'.format(
                    module)
                await self.cursor.execute(q)
                r = await self.cursor.fetchone()
                displayname = r[0]
                self.columngroups[level].append({
                    'name': module,
                    'displayname': displayname,
                    'count': len(cols)
                })
                for coldef in cols:
                    self.colnos[level][coldef.name] = colcount
                    colcount += 1
                    # NOTE(review): this tests for 'category' /
                    # 'multicategory' while the level-specific loop above
                    # tests for 'single' / 'multi'; the query below also
                    # selects a gene column from the `level` (variant) table.
                    # Confirm which spelling and table are intended.
                    if coldef.category in ['category', 'multicategory'
                                           ] and len(coldef.categories) == 0:
                        sql = 'select distinct {} from {}'.format(
                            coldef.name, level)
                        await self.cursor.execute(sql)
                        rs = await self.cursor.fetchall()
                        for r in rs:
                            coldef.categories.append(r[0])
                    column = coldef.get_colinfo()
                    columns.append(column)
                    # Appended on every call; the list is not reset here.
                    self.var_added_cols.append(coldef.name)
        # Gene level summary columns
        if level == 'gene':
            q = 'select name from variant_annotator'
            await self.cursor.execute(q)
            done_var_annotators = [v[0] for v in await self.cursor.fetchall()]
            self.summarizing_modules = []
            local_modules = au.get_local_module_infos_of_type('annotator')
            local_modules.update(
                au.get_local_module_infos_of_type('postaggregator'))
            summarizer_module_names = []
            # Pick the variant-level modules that are installed locally and
            # whose configuration contains 'can_summarize_by_gene' (only the
            # key's presence is tested).
            for module_name in done_var_annotators:
                if module_name in [
                        'base', 'hg19', 'hg18', 'extra_vcf_info',
                        'extra_variant_info'
                ]:
                    continue
                if module_name not in local_modules:
                    print(
                        '            [{}] module does not exist in the system. Gene level summary for this module is skipped.'
                        .format(module_name))
                    continue
                module = local_modules[module_name]
                if 'can_summarize_by_gene' in module.conf:
                    summarizer_module_names.append(module_name)
            # The mapper module always summarizes and always comes first.
            local_modules[self.mapper_name] = au.get_local_module_info(
                self.mapper_name)
            summarizer_module_names = [self.mapper_name
                                       ] + summarizer_module_names
            for module_name in summarizer_module_names:
                mi = local_modules[module_name]
                # NOTE(review): sys.path gains one entry per module on every
                # call; nothing in this method removes them again.
                sys.path = sys.path + [os.path.dirname(mi.script_path)]
                if module_name in done_var_annotators:
                    annot_cls = util.load_class(mi.script_path,
                                                'CravatAnnotator')
                elif module_name == self.mapper_name:
                    annot_cls = util.load_class(mi.script_path, 'Mapper')
                annot = annot_cls(
                    [mi.script_path, '__dummy__', '-d', self.output_dir], {})
                # The bare string below is a no-op expression statement
                # holding an older version of the column group construction.
                '''
                cols = conf['gene_summary_output_columns']
                columngroup = {}
                columngroup['name'] = os.path.basename(mi.script_path).split('.')[0]
                columngroup['displayname'] = conf['title']
                columngroup['count'] = len(cols)
                '''
                cols = mi.conf['gene_summary_output_columns']
                columngroup = {
                    'name': mi.name,
                    'displayname': mi.title,
                    'count': len(cols),
                }
                self.columngroups[level].append(columngroup)
                for col in cols:
                    coldef = ColumnDefinition(col)
                    coldef.name = columngroup['name'] + '__' + coldef.name
                    coldef.genesummary = True
                    column = coldef.get_colinfo()
                    columns.append(column)
                self.summarizing_modules.append([mi, annot, cols])
                # Summary columns are positioned after every column already
                # registered for the level.
                for col in cols:
                    fullname = module_name + '__' + col['name']
                    self.colnos[level][fullname] = len(self.colnos[level])
        # re-orders columns groups.
        colgrps = self.columngroups[level]
        newcolgrps = []
        for priority_colgrpname in priority_colgroupnames:
            for colgrp in colgrps:
                if colgrp['name'] == priority_colgrpname:
                    # The mapper and tagsampler groups are not kept as groups
                    # of their own; their column counts are folded into the
                    # first group collected so far.
                    # NOTE(review): newcolgrps[0] raises IndexError if no
                    # group has been appended yet (e.g. a configured order
                    # that lists the mapper before any other existing group).
                    if colgrp['name'] in [self.mapper_name, 'tagsampler']:
                        newcolgrps[0]['count'] += colgrp['count']
                    else:
                        newcolgrps.append(colgrp)
                    break
        # 'lastcol' is the running end position of each group.
        colpos = 0
        for colgrp in newcolgrps:
            colgrp['lastcol'] = colpos + colgrp['count']
            colpos = colgrp['lastcol']
        # Remaining groups follow, sorted by display name.
        colgrpnames = [
            v['displayname'] for v in colgrps
            if v['name'] not in priority_colgroupnames
        ]
        colgrpnames.sort()
        for colgrpname in colgrpnames:
            # The lookup is by display name and stops at the first group in
            # colgrps carrying that display name.
            for colgrp in colgrps:
                if colgrp['displayname'] == colgrpname:
                    colgrp['lastcol'] = colpos + colgrp['count']
                    newcolgrps.append(colgrp)
                    colpos += colgrp['count']
                    break
        # re-orders columns.
        self.colname_conversion[level] = {}
        new_columns = []
        self.newcolnos[level] = {}
        newcolno = 0
        for colgrp in newcolgrps:
            colgrpname = colgrp['name']
            for col in columns:
                colname = col['col_name']
                [grpname, oricolname] = colname.split('__')
                # Mapper and tagsampler columns are renamed into the 'base'
                # group; colname_conversion remembers the original name so
                # the raw data position can still be found.
                if colgrpname == 'base' and grpname in [
                        self.mapper_name, 'tagsampler'
                ]:
                    newcolname = 'base__' + colname.split('__')[1]
                    self.colname_conversion[level][newcolname] = colname
                    col['col_name'] = newcolname
                    new_columns.append(col)
                    self.newcolnos[level][newcolname] = newcolno
                    #self.colnos[level][newcolname] = colno
                    #del self.colnos[level][oldcolname]
                elif grpname == colgrpname:
                    new_columns.append(col)
                    self.newcolnos[level][colname] = newcolno
                else:
                    continue
                newcolno += 1
        self.colinfo[level] = {'colgroups': newcolgrps, 'columns': new_columns}
        # report substitution
        if level in ['variant', 'gene']:
            reportsubtable = level + '_reportsub'
            if await self.table_exists(reportsubtable):
                q = 'select * from {}'.format(reportsubtable)
                await self.cursor.execute(q)
                rs = await self.cursor.fetchall()
                # First column is used as the module name, second as a JSON
                # document with that module's substitutions.
                self.report_substitution = {}
                for r in rs:
                    module = r[0]
                    sub = json.loads(r[1])
                    self.report_substitution[module] = sub
                self.column_subs[level] = {}
                self.column_sub_allow_partial_match[level] = {}
                for i in range(len(new_columns)):
                    column = new_columns[i]
                    [module, col] = column['col_name'].split('__')
                    if module in [self.mapper_name]:
                        module = 'base'
                    if module in self.report_substitution:
                        sub = self.report_substitution[module]
                        if col in sub:
                            # 'all_mappings' and 'all_so' hold several values
                            # per cell, so their substitutions are compiled to
                            # word-boundary regexes for partial matching.
                            # The keys are inserted into the pattern
                            # unescaped.
                            if module in [
                                    'base', self.mapper_name
                            ] and col in ['all_mappings', 'all_so']:
                                allow_partial_match = True
                                self.column_subs[level][i] = {
                                    re.compile(fr'\b{key}\b'): val
                                    for key, val in sub[col].items()
                                }
                            else:
                                allow_partial_match = False
                                self.column_subs[level][i] = sub[col]
                            self.column_sub_allow_partial_match[level][
                                i] = allow_partial_match
                            new_columns[i]['reportsub'] = sub[col]

    async def connect_db(self, dbpath=None):
        """Open the aggregator result database and create a cursor on it.

        Sets ``self.conn`` and ``self.cursor``.

        Args:
            dbpath: Optional database path. When given it replaces
                ``self.dbpath``.

        Raises:
            SystemExit: With status 1, after writing a message to stderr,
                when no path is configured or the path does not exist.
        """
        if dbpath is not None:
            self.dbpath = dbpath
        # sys.exit(1) is used instead of the site-provided exit() helper so
        # that the failure is reported with a non-zero exit status.
        if self.dbpath is None:
            sys.stderr.write('Provide a path to aggregator output')
            sys.exit(1)
        if not os.path.exists(self.dbpath):
            sys.stderr.write(self.dbpath + ' does not exist.')
            sys.exit(1)
        self.conn = await aiosqlite3.connect(self.dbpath)
        self.cursor = await self.conn.cursor()

    async def load_filter(self):
        """Create the result filter for ``self.dbpath`` and load its settings.

        The filter object is stored in ``self.cf`` and then configured from
        ``self.filter``, ``self.filterpath``, ``self.filtername`` and
        ``self.filterstring``.
        """
        self.cf = await CravatFilter.create(dbpath=self.dbpath)
        filter_options = {
            'filter': self.filter,
            'filterpath': self.filterpath,
            'filtername': self.filtername,
            'filterstring': self.filterstring,
        }
        await self.cf.loadfilter(**filter_options)

    async def table_exists(self, tablename):
        """Report whether the database contains a table called ``tablename``.

        Args:
            tablename: Table name; it is concatenated into the query text.

        Returns:
            ``True`` when ``sqlite_master`` lists such a table, else
            ``False``.
        """
        query = ('select name from sqlite_master where type="table" and name="'
                 + tablename + '"')
        await self.cursor.execute(query)
        found = await self.cursor.fetchone()
        return found is not None
Ejemplo n.º 20
0
def get_cravat_conf():
    """Return the cravat configuration obtained from a fresh ConfigLoader."""
    from cravat.config_loader import ConfigLoader
    # The returned path is not used here; the call is retained in case
    # get_main_conf_path() has side effects -- TODO confirm.
    get_main_conf_path()
    return ConfigLoader().get_cravat_conf()
Ejemplo n.º 21
0
 def __init__(self, dir_path, name=None):
     self.directory = dir_path
     if name is None:
         self.name = os.path.basename(self.directory)
     else:
         self.name = name
     self.script_path = os.path.join(self.directory, self.name+'.py')
     #if importlib.util.find_spec('cython') is not None:
     #    pyx_path = self.script_path + 'x'
     #    if os.path.exists(pyx_path):
     #        self.script_path = pyx_path
     self.script_exists = os.path.exists(self.script_path)
     self.conf_path = os.path.join(self.directory, self.name+'.yml')
     self.conf_exists = os.path.exists(self.conf_path)
     self.exists = self.conf_exists
     startofinstall_path = os.path.join(self.directory, 'startofinstall')
     if os.path.exists(startofinstall_path):
         endofinstall_path = os.path.join(self.directory, 'endofinstall')
         if os.path.exists(endofinstall_path):
             self.exists = True
         else:
             self.exists = False
     self.data_dir = os.path.join(dir_path, 'data')
     self.data_dir_exists = os.path.isdir(self.data_dir)
     self.has_data = self.data_dir_exists \
                     and len(os.listdir(self.data_dir)) > 0
     self.test_dir = os.path.join(dir_path, 'test')
     self.test_dir_exists = os.path.isdir(self.test_dir)
     self.has_test = self.test_dir_exists \
                     and os.path.isfile(os.path.join(self.test_dir, 'input')) \
                     and  os.path.isfile(os.path.join(self.test_dir, 'key'))
     self.readme_path = os.path.join(self.directory, self.name+'.md')
     self.readme_exists = os.path.exists(self.readme_path)
     if self.readme_exists:
         with open(self.readme_path) as f:
             self.readme = f.read()
     else:
         self.readme = ''
     self.helphtml_path = os.path.join(self.directory, 'help.html')
     self.helphtml_exists = os.path.exists(self.helphtml_path)
     self.conf = {}
     if self.conf_exists:
         from cravat.config_loader import ConfigLoader
         conf = ConfigLoader()
         self.conf = conf.get_module_conf(self.name)
     self.type = self.conf.get('type')
     self.version = self.conf.get('version')
     self.description = self.conf.get('description')
     self.hidden = self.conf.get('hidden',False)
     dev_dict = self.conf.get('developer')
     if not(type(dev_dict)==dict):
         dev_dict = {}
     self.developer = get_developer_dict(**dev_dict)
     if 'type' not in self.conf:
         self.conf['type'] = 'unknown'
     self.type = self.conf['type']
     self.level = self.conf.get('level')
     self.input_format = self.conf.get('input_format')
     self.secondary_module_names = list(self.conf.get('secondary_inputs',{}))
     if self.type == 'annotator':
         if self.level == 'variant':
             self.output_suffix = self.name + '.var'
         elif self.level == 'gene':
             self.output_suffix = self.name + '.gen'
         else:
             self.output_suffix = self. name + '.' + self.type
     self.title = self.conf.get('title',self.name)
     self.disk_size = None
     self.tags = self.conf.get('tags',[])
     self.datasource = str(self.conf.get('datasource',''))
     self.smartfilters = self.conf.get('smartfilters')
     self.groups = self.conf.get('groups')