def _alter_tables(self):
    """Register this module and its output columns in the level's tables.

    Writes one row for the module into ``<level>_annotator``, makes sure the
    ``<level>`` data table has a column for every entry of
    ``self.conf["output_columns"]``, upserts each column definition (as JSON)
    into ``<level>_header``, and finally commits ``self.dbconn``.
    """
    # annotator table
    # Values are bound as text parameters so that quote characters in the
    # title or version cannot break the statement.
    q = "insert or replace into {} values (?, ?, ?)".format(
        self.level + "_annotator"
    )
    self.cursor_w.execute(
        q,
        [
            str(self.module_name),
            str(self.conf["title"]),
            str(self.conf["version"]),
        ],
    )
    # data table and header table
    header_table_name = self.level + "_header"
    for col_d in self.conf["output_columns"]:
        col_def = ColumnDefinition(col_d)
        colname = col_def.name
        coltype = col_def.type
        # data table: probe for the column and add it when the probe fails
        try:
            self.cursor.execute(f"select {colname} from {self.level} limit 1")
        except Exception:
            # NOTE(review): any probe failure is treated as "column missing";
            # narrow to the DB driver's error class once it is confirmed.
            q = (
                "alter table "
                + self.level
                + " add column "
                + colname
                + " "
                + self.cr_type_to_sql[coltype]
            )
            self.cursor_w.execute(q)
        # header table
        # use prepared statement to allow " characters in colcats and coldesc
        q = "insert or replace into {} values (?, ?)".format(header_table_name)
        self.cursor_w.execute(q, [colname, col_def.get_json()])
    self.dbconn.commit()
def _alter_tables(self):
    """Add this module's row and output columns to the level's tables.

    Steps, in order:

    1. upsert ``(module_name, title, version)`` into ``<level>_annotator``;
    2. for each entry of ``self.conf['output_columns']``, add the column to
       the ``<level>`` data table if selecting it fails, then upsert the
       column's JSON definition into ``<level>_header``;
    3. commit ``self.dbconn``.
    """
    # annotator table
    # Bound parameters keep a '"' inside the title or version from
    # terminating the SQL literal early.
    q = f'insert or replace into {self.level}_annotator values (?, ?, ?)'
    annotator_row = [
        str(self.module_name),
        str(self.conf['title']),
        str(self.conf['version']),
    ]
    self.cursor_w.execute(q, annotator_row)
    # data table and header table
    header_table_name = self.level + '_header'
    for col_d in self.conf['output_columns']:
        col_def = ColumnDefinition(col_d)
        colname = col_def.name
        coltype = col_def.type
        # data table
        try:
            self.cursor.execute(
                f'select {colname} from {self.level} limit 1')
        except Exception:
            # The probe failed, so the column is taken to be absent.
            # NOTE(review): presumably the driver signals a missing column
            # with its own error class - confirm and narrow this handler.
            q = 'alter table ' + self.level + ' add column ' +\
                colname + ' ' + self.cr_type_to_sql[coltype]
            self.cursor_w.execute(q)
        # header table
        # use prepared statement to allow " characters in colcats and coldesc
        q = 'insert or replace into {} values (?, ?)'.format(
            header_table_name)
        self.cursor_w.execute(q, [colname, col_def.get_json()])
    self.dbconn.commit()
def fill_categories(self):
    """Store the observed category values of each categorical output column.

    For every column in ``self.conf["output_columns"]`` whose category is
    ``"single"`` or ``"multi"``, the distinct values of that column in the
    ``<level>`` table are read, split on ``";"``, de-duplicated and sorted,
    and the resulting list is saved in the column definition that is written
    back to ``<level>_header``.

    This method does not commit.
    """
    for col_d in self.conf["output_columns"]:
        col_def = ColumnDefinition(col_d)
        if col_def.category not in ["single", "multi"]:
            continue
        q = "select distinct {} from {}".format(col_def.name, self.level)
        self.cursor.execute(q)
        # A set keeps de-duplication linear in the number of values.
        seen_cats = set()
        for r in self.cursor:
            # A NULL cell contributes the empty string as a category.
            col_cat_str = r[0] if r[0] is not None else ""
            seen_cats.update(col_cat_str.split(";"))
        col_def.categories = sorted(seen_cats)
        q = "update {}_header set col_def=? where col_name=?".format(self.level)
        self.cursor.execute(q, [col_def.get_json(), col_def.name])
def fill_categories(self):
    """Fill in and persist category lists for categorical header columns.

    Column definitions are loaded from ``<level>_header``: as JSON from the
    ``col_def`` column when the current package version is at least 1.5.0,
    otherwise from individual header columns selected in
    ``ColumnDefinition.db_order``.

    For each definition whose category is ``'single'`` or ``'multi'``:

    * if its category list is an empty (non-None) collection, the list is
      rebuilt from the distinct non-NULL values of the column in the
      ``<level>`` table, each split on ``';'``;
    * the list is passed through ``self.do_reportsub_col_cats``, sorted,
      stored on the definition and saved with ``self.update_col_def``.

    ``self.dbconn`` is committed at the end.
    """
    header_table = self.level + '_header'
    coldefs = []
    if LooseVersion(
            au.get_current_package_version()) >= LooseVersion('1.5.0'):
        sql = f'select col_def from {header_table}'
        self.cursor.execute(sql)
        for row in self.cursor:
            coldef = ColumnDefinition({})
            coldef.from_json(row[0])
            coldefs.append(coldef)
    else:
        sql = f'pragma table_info("{header_table}")'
        self.cursor.execute(sql)
        # Index 1 of each pragma row is used as the column name.
        header_cols = [row[1] for row in self.cursor.fetchall()]
        select_order = [
            cname for cname in ColumnDefinition.db_order
            if cname in header_cols
        ]
        sql = 'select {} from {}'.format(', '.join(select_order),
                                         header_table)
        self.cursor.execute(sql)
        for column_header in self.cursor.fetchall():
            coldef = ColumnDefinition({})
            coldef.from_row(column_header, order=select_order)
            coldefs.append(coldef)
    for coldef in coldefs:
        if coldef.category not in ['single', 'multi']:
            continue
        col_cats = coldef.categories
        if col_cats is not None and len(col_cats) == 0:
            q = f'select distinct {coldef.name} from {self.level}'
            self.cursor.execute(q)
            col_set = set()
            for r in self.cursor:
                if r[0] is None:
                    continue
                col_set.update(r[0].split(';'))
            col_cats = list(col_set)
        # Both the rebuilt and the pre-existing list go through the same
        # report-substitution step.
        col_cats = self.do_reportsub_col_cats(coldef.name, col_cats)
        col_cats.sort()
        coldef.categories = col_cats
        self.update_col_def(coldef)
    self.dbconn.commit()
def fill_categories(self):
    """Record the category values seen in each categorical output column.

    Every column of ``self.conf['output_columns']`` with category
    ``'single'`` or ``'multi'`` gets its ``categories`` set to the sorted,
    unique ``';'``-separated values found in that column of the ``<level>``
    table. The updated definition is written to ``<level>_header`` and
    ``self.dbconn`` is committed once all columns are processed.
    """
    for col_d in self.conf['output_columns']:
        col_def = ColumnDefinition(col_d)
        if col_def.category not in ['single', 'multi']:
            continue
        q = 'select distinct {} from {}'.format(col_def.name, self.level)
        self.cursor.execute(q)
        # Collect into a set: membership tests on a growing list made the
        # de-duplication quadratic.
        unique_cats = set()
        for r in self.cursor:
            # NULL is folded into the empty-string category.
            col_cat_str = r[0] if r[0] is not None else ''
            unique_cats.update(col_cat_str.split(';'))
        col_def.categories = sorted(unique_cats)
        q = 'update {}_header set col_def=? where col_name=?'.format(self.level)
        self.cursor.execute(q, [col_def.get_json(), col_def.name])
    self.dbconn.commit()
async def make_col_info(self, level, conn=None, cursor=None):
    """Build the column/column-group layout of a report level.

    Reads ``<level>_annotator`` and ``<level>_header`` through ``cursor``,
    optionally merges gene-level columns into the variant level and
    gene-summary columns into the gene level, re-orders groups and columns,
    and stores the result on the instance.

    Instance state written for ``level``: ``colnames_to_display``,
    ``columngroups``, ``colnos``, ``colname_conversion``, ``newcolnos``,
    ``colinfo``, ``display_select_columns``, ``colnos_to_display`` and, when
    a ``<level>_reportsub`` table exists, ``column_subs``. ``var_added_cols``
    is extended for the variant level and ``summarizing_modules`` is rebuilt
    for the gene level.

    Args:
        level: table/level name used to derive the queried table names.
        conn: not used by this method.
        cursor: awaitable DB cursor; every query is executed on it, so it
            must be supplied even though it defaults to ``None``.
    """
    self.colnames_to_display[level] = []
    await self.exec_db(self.store_mapper)
    cravat_conf = self.conf.get_cravat_conf()
    if "report_module_order" in cravat_conf:
        priority_colgroupnames = cravat_conf["report_module_order"]
    else:
        priority_colgroupnames = [
            "base", "hg38", "hg19", "hg18", "tagsampler"
        ]
    # level-specific column groups
    self.columngroups[level] = []
    sql = "select name, displayname from " + level + "_annotator"
    await cursor.execute(sql)
    rows = await cursor.fetchall()
    for row in rows:
        (name, displayname) = row
        self.columngroups[level].append({
            "name": name,
            "displayname": displayname,
            "count": 0
        })
    # level-specific column names
    header_table = level + "_header"
    coldefs = []
    sql = "select col_def from " + header_table
    await cursor.execute(sql)
    for row in await cursor.fetchall():
        coldef = ColumnDefinition({})
        coldef.from_json(row[0])
        coldefs.append(coldef)
    columns = []
    self.colnos[level] = {}
    colcount = 0
    # level-specific column details
    for coldef in coldefs:
        self.colnos[level][coldef.name] = colcount
        colcount += 1
        if coldef.category in ["single", "multi"] and len(
                coldef.categories) == 0:
            sql = "select distinct {} from {}".format(coldef.name, level)
            await cursor.execute(sql)
            rs = await cursor.fetchall()
            for r in rs:
                coldef.categories.append(r[0])
        # Column names must have exactly one "__" separator; anything else
        # raises ValueError on this unpacking.
        [colgrpname, _] = coldef.name.split("__")
        column = coldef.get_colinfo()
        columns.append(column)
        self.add_conditional_to_colnames_to_display(
            level, column, colgrpname)
        for columngroup in self.columngroups[level]:
            if columngroup["name"] == colgrpname:
                columngroup["count"] += 1
    # adds gene level columns to variant level.
    # `== False` is kept on purpose: a None flag must not enable the branch.
    if (self.nogenelevelonvariantlevel == False and level == "variant"
            and await self.exec_db(self.table_exists, "gene")):
        q = "select name from gene_annotator"
        await cursor.execute(q)
        gene_annotators = [v[0] for v in await cursor.fetchall()]
        modules_to_add = [m for m in gene_annotators if m != "base"]
        for module in modules_to_add:
            cols = []
            # Module names are bound as parameters so that quote characters
            # in a name cannot break the statements.
            q = "select col_def from gene_header where col_name like ?"
            await cursor.execute(q, [module + "__%"])
            rs = await cursor.fetchall()
            for r in rs:
                cd = ColumnDefinition({})
                cd.from_json(r[0])
                cols.append(cd)
            q = "select displayname from gene_annotator where name=?"
            await cursor.execute(q, [module])
            r = await cursor.fetchone()
            displayname = r[0]
            self.columngroups[level].append({
                "name": module,
                "displayname": displayname,
                "count": len(cols)
            })
            for coldef in cols:
                self.colnos[level][coldef.name] = colcount
                colcount += 1
                # NOTE(review): the level-specific loop above tests for
                # "single"/"multi" while this one tests for
                # "category"/"multicategory", and the query targets `level`
                # (variant) for a gene column - confirm which is intended.
                if (coldef.category in ["category", "multicategory"]
                        and len(coldef.categories) == 0):
                    sql = "select distinct {} from {}".format(
                        coldef.name, level)
                    await cursor.execute(sql)
                    rs = await cursor.fetchall()
                    for r in rs:
                        coldef.categories.append(r[0])
                column = coldef.get_colinfo()
                columns.append(column)
                self.add_conditional_to_colnames_to_display(
                    level, column, module)
                self.var_added_cols.append(coldef.name)
    # Gene level summary columns
    if level == "gene":
        q = "select name from variant_annotator"
        await cursor.execute(q)
        done_var_annotators = [v[0] for v in await cursor.fetchall()]
        self.summarizing_modules = []
        local_modules = au.get_local_module_infos_of_type("annotator")
        local_modules.update(
            au.get_local_module_infos_of_type("postaggregator"))
        summarizer_module_names = []
        for module_name in done_var_annotators:
            if module_name in [
                    "base",
                    "hg19",
                    "hg18",
                    "extra_vcf_info",
                    "extra_variant_info",
            ]:
                continue
            if module_name not in local_modules:
                if self.args.silent == False and module_name != 'original_input':
                    print(
                        " [{}] module does not exist in the system. Gene level summary for this module is skipped."
                        .format(module_name))
                continue
            module = local_modules[module_name]
            if "can_summarize_by_gene" in module.conf:
                summarizer_module_names.append(module_name)
        local_modules[self.mapper_name] = au.get_local_module_info(
            self.mapper_name)
        summarizer_module_names = [self.mapper_name
                                   ] + summarizer_module_names
        for module_name in summarizer_module_names:
            mi = local_modules[module_name]
            # Make the module's directory importable, without piling up a
            # duplicate sys.path entry on every call.
            module_dir = os.path.dirname(mi.script_path)
            if module_dir not in sys.path:
                sys.path.append(module_dir)
            if module_name in done_var_annotators:
                annot_cls = util.load_class(mi.script_path,
                                            "CravatAnnotator")
            elif module_name == self.mapper_name:
                annot_cls = util.load_class(mi.script_path, "Mapper")
            cmd = {
                "script_path": mi.script_path,
                "input_file": "__dummy__",
                "output_dir": self.output_dir,
            }
            annot = annot_cls(cmd)
            cols = mi.conf["gene_summary_output_columns"]
            columngroup = {
                "name": mi.name,
                "displayname": mi.title,
                "count": len(cols),
            }
            self.columngroups[level].append(columngroup)
            for col in cols:
                coldef = ColumnDefinition(col)
                coldef.name = columngroup["name"] + "__" + coldef.name
                coldef.genesummary = True
                column = coldef.get_colinfo()
                columns.append(column)
                self.add_conditional_to_colnames_to_display(
                    level, column, mi.name)
            self.summarizing_modules.append([mi, annot, cols])
            for col in cols:
                fullname = module_name + "__" + col["name"]
                self.colnos[level][fullname] = len(self.colnos[level])
    # re-orders columns groups.
    colgrps = self.columngroups[level]
    newcolgrps = []
    for priority_colgrpname in priority_colgroupnames:
        for colgrp in colgrps:
            if colgrp["name"] == priority_colgrpname:
                if colgrp["name"] in [self.mapper_name, "tagsampler"]:
                    # Mapper/tagsampler columns are counted under the first
                    # ordered group. NOTE(review): raises IndexError when no
                    # group has been placed yet - confirm "base" always
                    # precedes these names in the priority list.
                    newcolgrps[0]["count"] += colgrp["count"]
                else:
                    newcolgrps.append(colgrp)
                break
    colpos = 0
    for colgrp in newcolgrps:
        colgrp["lastcol"] = colpos + colgrp["count"]
        colpos = colgrp["lastcol"]
    # Remaining groups follow, ordered by display name.
    colgrpnames = [
        v["displayname"] for v in colgrps
        if v["name"] not in priority_colgroupnames
    ]
    colgrpnames.sort()
    for colgrpname in colgrpnames:
        for colgrp in colgrps:
            if colgrp["displayname"] == colgrpname:
                colgrp["lastcol"] = colpos + colgrp["count"]
                newcolgrps.append(colgrp)
                colpos += colgrp["count"]
                break
    # re-orders columns.
    self.colname_conversion[level] = {}
    new_columns = []
    self.newcolnos[level] = {}
    newcolno = 0
    new_colnames_to_display = []
    for colgrp in newcolgrps:
        colgrpname = colgrp["name"]
        for col in columns:
            colname = col["col_name"]
            [grpname, _] = colname.split("__")
            if colgrpname == "base" and grpname in [
                    self.mapper_name, "tagsampler"
            ]:
                # Mapper/tagsampler columns are renamed into the base group;
                # the original name is kept in colname_conversion.
                newcolname = "base__" + colname.split("__")[1]
                self.colname_conversion[level][newcolname] = colname
                col["col_name"] = newcolname
                new_columns.append(col)
                self.newcolnos[level][newcolname] = newcolno
                if newcolname in self.colnames_to_display[level]:
                    new_colnames_to_display.append(newcolname)
            elif grpname == colgrpname:
                new_columns.append(col)
                self.newcolnos[level][colname] = newcolno
                if colname in self.colnames_to_display[level]:
                    new_colnames_to_display.append(colname)
            else:
                continue
            newcolno += 1
    self.colinfo[level] = {"colgroups": newcolgrps, "columns": new_columns}
    self.colnames_to_display[level] = new_colnames_to_display
    # report substitution
    if level in ["variant", "gene"]:
        reportsubtable = level + "_reportsub"
        if await self.exec_db(self.table_exists, reportsubtable):
            q = "select * from {}".format(reportsubtable)
            await cursor.execute(q)
            reportsub = {
                r[0]: json.loads(r[1])
                for r in await cursor.fetchall()
            }
            self.column_subs[level] = []
            for i, column in enumerate(new_columns):
                module, col = column["col_name"].split("__")
                if module == self.mapper_name:
                    module = "base"
                if module in reportsub and col in reportsub[module]:
                    self.column_subs[level].append(
                        SimpleNamespace(
                            module=module,
                            col=col,
                            index=i,
                            subs=reportsub[module][col],
                        ))
                    new_columns[i]["reportsub"] = reportsub[module][col]
    # display_select_columns
    has_extract_columns = (level in self.extract_columns_multilevel
                           and len(self.extract_columns_multilevel[level]) > 0)
    self.display_select_columns[level] = bool(has_extract_columns
                                              or self.concise_report)
    # column numbers to display
    colno = 0
    self.colnos_to_display[level] = []
    for colgroup in self.colinfo[level]["colgroups"]:
        count = colgroup["count"]
        if count == 0:
            continue
        for col in self.colinfo[level]["columns"][colno:colno + count]:
            if col["col_name"] in self.colnames_to_display[level]:
                self.colnos_to_display[level].append(colno)
            colno += 1
async def make_col_info(self, level):
    """Build the column/column-group layout of a report level.

    Reads ``<level>_annotator`` and ``<level>_header`` through
    ``self.cursor``, optionally merges gene-level columns into the variant
    level and gene-summary columns into the gene level, re-orders groups and
    columns, and stores the result on the instance.

    Instance state written for ``level``: ``columngroups``, ``colnos``,
    ``colname_conversion``, ``newcolnos`` and ``colinfo``. When a
    ``<level>_reportsub`` table exists, ``report_substitution`` is rebuilt
    and ``column_subs`` / ``column_sub_allow_partial_match`` are set for the
    level. ``var_added_cols`` is extended for the variant level and
    ``summarizing_modules`` is rebuilt for the gene level.

    Args:
        level: table/level name used to derive the queried table names.
    """
    await self.store_mapper()
    cravat_conf = self.conf.get_cravat_conf()
    if 'report_module_order' in cravat_conf:
        priority_colgroupnames = cravat_conf['report_module_order']
    else:
        priority_colgroupnames = [
            'base', 'hg38', 'hg19', 'hg18', 'tagsampler'
        ]
    # level-specific column groups
    self.columngroups[level] = []
    sql = 'select name, displayname from ' + level + '_annotator'
    await self.cursor.execute(sql)
    rows = await self.cursor.fetchall()
    for row in rows:
        (name, displayname) = row
        self.columngroups[level].append({
            'name': name,
            'displayname': displayname,
            'count': 0
        })
    # level-specific column names
    header_table = level + '_header'
    coldefs = []
    sql = 'select col_def from ' + header_table
    await self.cursor.execute(sql)
    for row in await self.cursor.fetchall():
        coldef = ColumnDefinition({})
        coldef.from_json(row[0])
        coldefs.append(coldef)
    columns = []
    self.colnos[level] = {}
    colcount = 0
    # level-specific column details
    for coldef in coldefs:
        self.colnos[level][coldef.name] = colcount
        colcount += 1
        if coldef.category in ['single', 'multi'] and len(
                coldef.categories) == 0:
            sql = 'select distinct {} from {}'.format(coldef.name, level)
            await self.cursor.execute(sql)
            rs = await self.cursor.fetchall()
            for r in rs:
                coldef.categories.append(r[0])
        # Column names must have exactly one '__' separator; anything else
        # raises ValueError on this unpacking.
        [colgrpname, _] = coldef.name.split('__')
        column = coldef.get_colinfo()
        columns.append(column)
        for columngroup in self.columngroups[level]:
            if columngroup['name'] == colgrpname:
                columngroup['count'] += 1
    # adds gene level columns to variant level.
    # `== False` is kept on purpose: a None flag must not enable the branch.
    if (self.nogenelevelonvariantlevel == False and level == 'variant'
            and await self.table_exists('gene')):
        q = 'select name from gene_annotator'
        await self.cursor.execute(q)
        gene_annotators = [v[0] for v in await self.cursor.fetchall()]
        modules_to_add = [m for m in gene_annotators if m != 'base']
        for module in modules_to_add:
            cols = []
            # Module names are bound as parameters so that quote characters
            # in a name cannot break the statements.
            q = 'select col_def from gene_header where col_name like ?'
            await self.cursor.execute(q, [module + '__%'])
            rs = await self.cursor.fetchall()
            for r in rs:
                cd = ColumnDefinition({})
                cd.from_json(r[0])
                cols.append(cd)
            q = 'select displayname from gene_annotator where name=?'
            await self.cursor.execute(q, [module])
            r = await self.cursor.fetchone()
            displayname = r[0]
            self.columngroups[level].append({
                'name': module,
                'displayname': displayname,
                'count': len(cols)
            })
            for coldef in cols:
                self.colnos[level][coldef.name] = colcount
                colcount += 1
                # NOTE(review): the level-specific loop above tests for
                # 'single'/'multi' while this one tests for
                # 'category'/'multicategory', and the query targets `level`
                # (variant) for a gene column - confirm which is intended.
                if coldef.category in ['category', 'multicategory'
                                       ] and len(coldef.categories) == 0:
                    sql = 'select distinct {} from {}'.format(
                        coldef.name, level)
                    await self.cursor.execute(sql)
                    rs = await self.cursor.fetchall()
                    for r in rs:
                        coldef.categories.append(r[0])
                column = coldef.get_colinfo()
                columns.append(column)
                self.var_added_cols.append(coldef.name)
    # Gene level summary columns
    if level == 'gene':
        q = 'select name from variant_annotator'
        await self.cursor.execute(q)
        done_var_annotators = [v[0] for v in await self.cursor.fetchall()]
        self.summarizing_modules = []
        local_modules = au.get_local_module_infos_of_type('annotator')
        local_modules.update(
            au.get_local_module_infos_of_type('postaggregator'))
        summarizer_module_names = []
        for module_name in done_var_annotators:
            if module_name in [
                    'base', 'hg19', 'hg18', 'extra_vcf_info',
                    'extra_variant_info'
            ]:
                continue
            if module_name not in local_modules:
                print(
                    ' [{}] module does not exist in the system. Gene level summary for this module is skipped.'
                    .format(module_name))
                continue
            module = local_modules[module_name]
            if 'can_summarize_by_gene' in module.conf:
                summarizer_module_names.append(module_name)
        local_modules[self.mapper_name] = au.get_local_module_info(
            self.mapper_name)
        summarizer_module_names = [self.mapper_name
                                   ] + summarizer_module_names
        for module_name in summarizer_module_names:
            mi = local_modules[module_name]
            # Make the module's directory importable, without piling up a
            # duplicate sys.path entry on every call.
            module_dir = os.path.dirname(mi.script_path)
            if module_dir not in sys.path:
                sys.path.append(module_dir)
            if module_name in done_var_annotators:
                annot_cls = util.load_class(mi.script_path,
                                            'CravatAnnotator')
            elif module_name == self.mapper_name:
                annot_cls = util.load_class(mi.script_path, 'Mapper')
            annot = annot_cls(
                [mi.script_path, '__dummy__', '-d', self.output_dir], {})
            cols = mi.conf['gene_summary_output_columns']
            columngroup = {
                'name': mi.name,
                'displayname': mi.title,
                'count': len(cols),
            }
            self.columngroups[level].append(columngroup)
            for col in cols:
                coldef = ColumnDefinition(col)
                coldef.name = columngroup['name'] + '__' + coldef.name
                coldef.genesummary = True
                column = coldef.get_colinfo()
                columns.append(column)
            self.summarizing_modules.append([mi, annot, cols])
            for col in cols:
                fullname = module_name + '__' + col['name']
                self.colnos[level][fullname] = len(self.colnos[level])
    # re-orders columns groups.
    colgrps = self.columngroups[level]
    newcolgrps = []
    for priority_colgrpname in priority_colgroupnames:
        for colgrp in colgrps:
            if colgrp['name'] == priority_colgrpname:
                if colgrp['name'] in [self.mapper_name, 'tagsampler']:
                    # Mapper/tagsampler columns are counted under the first
                    # ordered group. NOTE(review): raises IndexError when no
                    # group has been placed yet - confirm 'base' always
                    # precedes these names in the priority list.
                    newcolgrps[0]['count'] += colgrp['count']
                else:
                    newcolgrps.append(colgrp)
                break
    colpos = 0
    for colgrp in newcolgrps:
        colgrp['lastcol'] = colpos + colgrp['count']
        colpos = colgrp['lastcol']
    # Remaining groups follow, ordered by display name.
    colgrpnames = [
        v['displayname'] for v in colgrps
        if v['name'] not in priority_colgroupnames
    ]
    colgrpnames.sort()
    for colgrpname in colgrpnames:
        for colgrp in colgrps:
            if colgrp['displayname'] == colgrpname:
                colgrp['lastcol'] = colpos + colgrp['count']
                newcolgrps.append(colgrp)
                colpos += colgrp['count']
                break
    # re-orders columns.
    self.colname_conversion[level] = {}
    new_columns = []
    self.newcolnos[level] = {}
    newcolno = 0
    for colgrp in newcolgrps:
        colgrpname = colgrp['name']
        for col in columns:
            colname = col['col_name']
            [grpname, _] = colname.split('__')
            if colgrpname == 'base' and grpname in [
                    self.mapper_name, 'tagsampler'
            ]:
                # Mapper/tagsampler columns are renamed into the base group;
                # the original name is kept in colname_conversion.
                newcolname = 'base__' + colname.split('__')[1]
                self.colname_conversion[level][newcolname] = colname
                col['col_name'] = newcolname
                new_columns.append(col)
                self.newcolnos[level][newcolname] = newcolno
            elif grpname == colgrpname:
                new_columns.append(col)
                self.newcolnos[level][colname] = newcolno
            else:
                continue
            newcolno += 1
    self.colinfo[level] = {'colgroups': newcolgrps, 'columns': new_columns}
    # report substitution
    if level in ['variant', 'gene']:
        reportsubtable = level + '_reportsub'
        if await self.table_exists(reportsubtable):
            q = 'select * from {}'.format(reportsubtable)
            await self.cursor.execute(q)
            rs = await self.cursor.fetchall()
            self.report_substitution = {}
            for r in rs:
                module = r[0]
                sub = json.loads(r[1])
                self.report_substitution[module] = sub
            self.column_subs[level] = {}
            self.column_sub_allow_partial_match[level] = {}
            for i, column in enumerate(new_columns):
                [module, col] = column['col_name'].split('__')
                if module in [self.mapper_name]:
                    module = 'base'
                if module not in self.report_substitution:
                    continue
                sub = self.report_substitution[module]
                if col not in sub:
                    continue
                if module in ['base', self.mapper_name
                              ] and col in ['all_mappings', 'all_so']:
                    allow_partial_match = True
                    # Keys become whole-word regex patterns for these
                    # columns. NOTE(review): keys are interpolated
                    # unescaped - confirm they never contain regex
                    # metacharacters.
                    self.column_subs[level][i] = {
                        re.compile(fr'\b{key}\b'): val
                        for key, val in sub[col].items()
                    }
                else:
                    allow_partial_match = False
                    self.column_subs[level][i] = sub[col]
                self.column_sub_allow_partial_match[level][
                    i] = allow_partial_match
                new_columns[i]['reportsub'] = sub[col]
def _setup_table(self):
    """Create (or, in append mode, extend) the tables of this level.

    Handles, in order: the ``<level>_annotator`` table, the data table
    ``self.table_name`` and its indexes, the header table
    ``self.header_table_name``, the report-substitution table (variant and
    gene levels only) and the ``viewersetup`` table, then commits
    ``self.dbconn``.

    When ``self.append`` is false every table is dropped and recreated.
    When it is true, existing tables are kept: missing data columns are
    added, existing non-base columns are reset to NULL, and header rows are
    merged with the new column definitions.

    Exits the process via ``sys.exit`` when a duplicate column name is found
    outside append mode.
    """
    columns = []
    unique_names = set()
    # annotator table
    annotator_table = self.level + '_annotator'
    if not self.append:
        q = f'drop table if exists {annotator_table}'
        self.cursor.execute(q)
        q = f'create table {annotator_table} (name text primary key, displayname text, version text)'
        self.cursor.execute(q)
        q = f'insert into {annotator_table} values ("base", "Variant Annotation", "")'
        self.cursor.execute(q)
    # Base columns are prefixed in place on the reader's own definitions.
    for _, col_def in self.base_reader.get_all_col_defs().items():
        col_name = self.base_prefix + '__' + col_def.name
        col_def.name = col_name
        columns.append(col_def)
        unique_names.add(col_name)
    for annot_name in self.annotators:
        reader = self.readers[annot_name]
        annotator_name = reader.get_annotator_name()
        if annotator_name == '':
            annotator_name = annot_name
        annotator_displayname = reader.get_annotator_displayname()
        if annotator_displayname == '':
            annotator_displayname = annotator_name.upper()
        annotator_version = reader.get_annotator_version()
        q = f'insert or replace into {annotator_table} values (?, ?, ?)'
        self.cursor.execute(
            q, [annotator_name, annotator_displayname, annotator_version])
        orded_col_index = sorted(reader.get_all_col_defs().keys())
        for col_index in orded_col_index:
            col_def = reader.get_col_def(col_index)
            reader_col_name = col_def.name
            # The key column is not copied from annotator outputs.
            if reader_col_name == self.key_name:
                continue
            col_def.name = '%s__%s' % (annot_name, reader_col_name)
            if col_def.name in unique_names and not self.append:
                err_msg = 'Duplicate column name %s found in %s. ' \
                    % (col_def.name, reader.path)
                sys.exit(err_msg)
            columns.append(col_def)
            unique_names.add(col_def.name)
    # data table
    col_def_strings = [
        col_def.name + ' ' + self.cr_type_to_sql[col_def.type]
        for col_def in columns
    ]
    if not self.append:
        q = f'drop table if exists {self.table_name}'
        self.cursor.execute(q)
        q = 'create table {} ({});'.format(
            self.table_name,
            ', '.join(col_def_strings),
        )
        self.cursor.execute(q)
        # index tables
        # index_columns is a list of columns to include in this index
        for index_n, index_columns in enumerate(
                self.base_reader.get_index_columns()):
            cols = ['base__{0}'.format(x) for x in index_columns]
            q = 'create index {}_idx_{} on {} ({});'.format(
                self.table_name,
                index_n,
                self.table_name,
                ', '.join(cols),
            )
            self.cursor.execute(q)
    else:
        q = f'pragma table_info({self.table_name})'
        self.cursor.execute(q)
        # Index 1 of each pragma row is used as the column name.
        cur_cols = {x[1] for x in self.cursor}
        for cds in col_def_strings:
            col_name = cds.split(' ')[0]
            if col_name in cur_cols:
                # NOTE(review): any name starting with 'base' is skipped,
                # not only names under the base prefix - confirm intended.
                if col_name.startswith('base'):
                    continue
                q = f'update {self.table_name} set {col_name} = null'
            else:
                q = f'alter table {self.table_name} add column {cds}'
            self.cursor.execute(q)
    # header table
    if not self.append:
        q = f'drop table if exists {self.header_table_name}'
        self.cursor.execute(q)
        q = f'create table {self.header_table_name} (col_name text primary key, col_def text);'
        self.cursor.execute(q)
    # Existing header rows are loaded first so that new definitions
    # overwrite same-named entries while keeping the stored order.
    q = f'select col_name, col_def from {self.header_table_name}'
    self.cursor.execute(q)
    cdefs = OrderedDict()
    for cname, cjson in self.cursor:
        cdefs[cname] = ColumnDefinition(json.loads(cjson))
    if cdefs:
        self.cursor.execute(f'delete from {self.header_table_name}')
    for cdef in columns:
        cdefs[cdef.name] = cdef
    insert_template = f'insert into {self.header_table_name} values (?, ?)'
    for cdef in cdefs.values():
        self.cursor.execute(insert_template, [cdef.name, cdef.get_json()])
    # report substitution table
    if self.level in ['variant', 'gene']:
        if not self.append:
            q = f'drop table if exists {self.reportsub_table_name}'
            self.cursor.execute(q)
            q = f'create table {self.reportsub_table_name} (module text primary key, subdict text)'
            self.cursor.execute(q)
            if hasattr(self.base_reader, 'report_substitution'):
                sub = self.base_reader.report_substitution
                if sub:
                    q = f'insert into {self.reportsub_table_name} values ("base", ?)'
                    self.cursor.execute(q, [json.dumps(sub)])
        for module in self.readers:
            # Check the reader whose attribute is read, not the base reader,
            # so a reader without the attribute is skipped instead of
            # raising AttributeError.
            sub = getattr(self.readers[module], 'report_substitution', None)
            if sub:
                q = f'insert or replace into {self.reportsub_table_name} values (?, ?)'
                self.cursor.execute(q, [module, json.dumps(sub)])
    self.make_reportsub()
    # filter and layout save table
    if not self.append:
        q = 'drop table if exists viewersetup'
        self.cursor.execute(q)
        q = 'create table viewersetup (datatype text, name text, viewersetup text, unique (datatype, name))'
        self.cursor.execute(q)
    self.dbconn.commit()