def scan_dirs(self, dirs):
    """Recursively scan paths for CGData files and register what is found.

    Every entry of ``dirs`` is logged and expanded with ``glob``: a
    directory is expanded to its immediate children, anything else is used
    as a glob pattern as-is.  For each match:

    * a ``.json`` file is parsed and, when it is an object with a ``type``
      key and a non-None ``name``, registered through ``addFile``;
      unparsable JSON is reported through ``CGData.error`` and skipped;
    * a ``.cgz`` archive is listed with ``CGData.CGZ.list`` and each listed
      member is registered with the archive path as its ``zipFile``;
    * a directory is scanned recursively.

    Args:
        dirs: iterable of directory paths or glob patterns.
    """
    for entry in dirs:
        CGData.log("SCANNING DIR: %s" % (entry))
        if os.path.isdir(entry):
            pattern = os.path.join(entry, "*")
        else:
            pattern = entry
        for path in glob(pattern):
            if os.path.isfile(path):
                if path.endswith(".json"):
                    data = None
                    # The context manager closes the handle even if the
                    # error reporting below raises.
                    with open(path) as handle:
                        try:
                            data = json.loads(handle.read())
                        except ValueError as e:
                            CGData.error("BAD JSON in " + path + " " + str(e))
                    # Valid JSON need not be an object; only objects can
                    # carry the 'type'/'name' metadata.
                    if (isinstance(data, dict)
                            and data.get('name') is not None
                            and 'type' in data):
                        self.addFile(data['type'], data['name'], path)
                # str.endswith does no globbing: the former "*.cgz" test
                # only matched file names containing a literal '*'.
                if path.endswith(".cgz"):
                    cgz_list = CGData.CGZ.list(path)
                    for file_type in cgz_list:
                        for z_path in cgz_list[file_type]:
                            self.addFile(file_type, cgz_list[file_type][z_path], z_path, path)
            if os.path.isdir(path):
                self.scan_dirs([path])
def gen_sql(self, id_table):
    """Yield the SQL statements that load this track.

    The statements are, in order: the ``raDb`` registration row, the
    ``sample_<track>`` table and its rows, and the ``genomic_<track>`` BED
    table with one row per probe that is present in the probe map.  Probes
    missing from the probe map are counted and the count is logged at the
    end.

    Args:
        id_table: object whose ``get('sample_id', name)`` returns the id of
            a sample (formatted with ``%d``, so assumed to be an integer).

    Yields:
        SQL statement strings.  Nothing is yielded when the probe map has
        no ``hg18`` assembly; that case is reported via ``CGData.error``.
    """
    gmatrix = self.members['genomicMatrix']
    # BUG: hard coded to only producing HG18 tables
    pmap = self.members['probeMap'].get(assembly="hg18")
    if pmap is None:
        CGData.error("Missing HG18 %s" % (self.members['probeMap'].get_name()))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members["clinicalMatrix"].get_name()
    samples = gmatrix.get_sample_list()

    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % (
        "genomic_" + table_base,
        "sample_" + table_base,
        "clinical_" + clinical_table_base,
        "clinical_" + clinical_table_base + "_colDb",
        "genomic_" + table_base + "_alias",
        table_base,
        len(samples),
        'bed 15',
        gmatrix.attrs[':dataSubType'],
        'localDb',
        'public',
    )

    # write out the sample table
    yield "drop table if exists sample_%s;" % (table_base)
    yield """ CREATE TABLE sample_%s ( id int, sampleName varchar(255) ) engine 'MyISAM'; """ % (table_base)
    for sample in samples:
        # Sample names are escaped like the probe names below so that a
        # quote in a name cannot break the statement.
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get('sample_id', sample), sql_fix(sample))

    # write out the BED table
    yield "drop table if exists %s;" % ("genomic_" + table_base)
    yield CREATE_BED % ("genomic_" + table_base)

    sample_ids = [str(id_table.get('sample_id', sample)) for sample in samples]
    # The id list is the same for every probe, so it is built only once.
    exp_ids = ','.join(sample_ids)
    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        row = gmatrix.get_row_vals(probe_name)
        probe = pmap.get(probe_name)
        if probe is None:
            missingProbeCount += 1
            continue
        exps = ','.join(str(a) for a in row)
        yield "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' );\n" % (
            "genomic_%s" % (table_base),
            probe.chrom,
            probe.chrom_start,
            probe.chrom_end,
            probe.strand,
            sql_fix(probe_name),
            len(sample_ids),
            exp_ids,
            exps,
        )
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def addFile(self, type, name, path, zipFile=None):
    """Register one discovered file under its type and name.

    A type that ``CGData.has_type`` does not recognise only produces a
    warning.  Otherwise the file is loaded with ``CGData.light_load`` and
    stored at ``self[type][name]``; a name that is already present for the
    type is reported through ``CGData.error`` before the entry is replaced.

    Args:
        type: CGData type name of the file.
        name: name of the file within its type.
        path: path of the file (or of the member inside ``zipFile``).
        zipFile: optional archive path, handed to ``CGData.light_load``.
    """
    # Guard clause: nothing is stored for unknown types.
    if not CGData.has_type(type):
        CGData.warn("Unknown file type: %s" % (path))
        return

    if type not in self:
        self[type] = {}
    if name in self[type]:
        CGData.error("Duplicate %s file %s" % (type, name))
    self[type][name] = CGData.light_load(path, zipFile)
    CGData.log("FOUND: " + "\t".join((type, name, path)))
def gen_sql_heatmap(self, id_table):
    """Yield the SQL statements that load this track for the heatmap.

    The statements are produced in this order: the children's heatmap SQL
    (with ``clinicalMatrix`` temporarily withheld), the ``raDb``
    registration row, the ``sample_<track>`` table, the
    ``genomic_<track>_alias`` table, and the BED table, which is filled
    through a ``_tmp`` table and then copied ordered by chrom/chromStart.

    Args:
        id_table: object whose ``get(namespace, name)`` returns the id of a
            sample; ids are formatted with ``%d`` and used as sort keys, so
            they are assumed to be integers.

    Yields:
        SQL statement strings.  After the children's SQL nothing more is
        yielded when the probe map has no ``hg18`` assembly.
    """
    # scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
    # it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    try:
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
    finally:
        # Restore the member even when the child pass raises or the
        # consumer closes this generator early.
        self.members["clinicalMatrix"] = clinical

    gmatrix = self.members['genomicMatrix']
    # BUG: hard coded to only producing HG18 tables
    pmap = self.members['probeMap'].get(assembly="hg18")
    if pmap is None:
        CGData.error("Missing HG18 %s" % (self.members['probeMap'].get_name()))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members["clinicalMatrix"].get_name()
    sample_id_key = clinical_table_base + ':sample_id'

    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % (
        "genomic_" + table_base,
        "sample_" + table_base,
        "clinical_" + clinical_table_base,
        "clinical_" + clinical_table_base + "_colDb",
        "genomic_" + table_base + "_alias",
        sql_fix(gmatrix.attrs['shortTitle']),
        sql_fix(gmatrix.attrs['longTitle']),
        len(gmatrix.get_sample_list()),
        self.format,
        gmatrix.attrs[':dataSubType'],
        'localDb',
        'public',
    )

    # write out the sample table
    yield "drop table if exists sample_%s;" % (table_base)
    yield """ CREATE TABLE sample_%s ( id int, sampleName varchar(255) ) engine 'MyISAM'; """ % (table_base)

    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        # The sample name is escaped so a quote cannot break the statement.
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get(sample_id_key, sample), sql_fix(sample))

    yield "drop table if exists genomic_%s_alias;" % (table_base)
    yield """ CREATE TABLE genomic_%s_alias ( name varchar(255), alias varchar(255) ) engine 'MyISAM'; """ % (table_base)

    for pset in pmap:
        for probe in pset:
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                    table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ("genomic_" + table_base)
    yield CREATE_BED % ("genomic_" + table_base + "_tmp")

    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below.  A key function gives the same stable
    # ordering as the former cmp= subtraction, and building the two lists
    # separately also copes with an empty sample list.
    ranked = sorted(
        enumerate(gmatrix.get_sample_list()),
        key=lambda pair: id_table.get(sample_id_key, pair[1]))
    order = [index for index, _ in ranked]
    samples = [sample for _, sample in ranked]

    sample_ids = [str(id_table.get(sample_id_key, sample)) for sample in samples]
    exp_ids = ','.join(sample_ids)

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        values = gmatrix.get_row_vals(probe_name)
        row = [values[order[i]] for i in range(len(values))]

        pset = pmap.get(probe_name)
        if pset is not None:
            for probe in pset:
                yield "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % (
                    "genomic_%s_tmp" % (table_base),
                    probe.chrom,
                    probe.chrom_start,
                    probe.chrom_end,
                    probe.strand,
                    sql_fix(probe_name),
                    len(sample_ids),
                    exp_ids,
                    self.scores(row),
                )
        else:
            missingProbeCount += 1

    yield "create table genomic_%s like genomic_%s_tmp;" % (table_base, table_base)
    yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def gen_sql_heatmap(self, id_table):
    """Yield the SQL statements that load this track for the heatmap.

    The statements are produced in this order: the children's heatmap SQL
    (with ``clinicalMatrix`` temporarily withheld), a delete/insert pair
    that refreshes the ``raDb`` registration row, the ``sample_<track>``
    table, the ``genomic_<track>_alias`` table, and the BED table, which is
    filled through a ``_tmp`` table and then copied ordered by
    chrom/chromStart.

    Args:
        id_table: object whose ``get(namespace, name)`` returns the id of a
            sample; ids are formatted with ``%d`` and used as sort keys, so
            they are assumed to be integers.

    Yields:
        SQL statement strings.  After the children's SQL nothing more is
        yielded when the probe map has no ``hg18`` assembly.

    Raises:
        ValueError: when the matrix ``version`` is not ``YYYY-MM-DD``
            (a missing version is the empty string and also fails).
    """
    # scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
    # it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    try:
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
    finally:
        # Restore the member even when the child pass raises or the
        # consumer closes this generator early.
        self.members["clinicalMatrix"] = clinical

    gmatrix = self.members["genomicMatrix"]
    # BUG: hard coded to only producing HG18 tables
    pmap = self.members["probeMap"].lookup(assembly="hg18")
    if pmap is None:
        CGData.error("Missing HG18 %s" % (self.members["probeMap"].get_name()))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members["clinicalMatrix"].get_name()
    sample_id_key = clinical_table_base + ":sample_id"

    # Free-form metadata that is serialised as JSON into raDb.other.
    other = {}
    for attr in ["wrangler", "wrangling_procedure", "url", "citation", "description"]:
        if attr in gmatrix:
            other[attr] = gmatrix[attr]
    if "dataProducer" in gmatrix:
        other["author_list"] = gmatrix["dataProducer"]
    if "articleTitle" in gmatrix:
        other["article_title"] = gmatrix["articleTitle"]
    other["version"] = gmatrix.get("version", "")
    # Validation only: if the version isn't properly formatted, throw an exception.
    datetime.datetime.strptime(other["version"], "%Y-%m-%d")
    if "owner" in gmatrix:
        other["owner"] = gmatrix["owner"]
    # Boolean flags: anything that is not a real bool falls back to False.
    for flag in ("colNormalization", "redistribution"):
        value = gmatrix.get(flag, False)
        other[flag] = value if isinstance(value, bool) else False
    other["security"] = gmatrix.get("security", "public")
    if other["security"] not in ["public", "private"]:
        other["security"] = "public"

    def quoted_or_null(key):
        # Quoted, escaped attribute value, or a bare \N when the attribute
        # is absent (presumably MySQL's NULL shorthand -- confirm).
        if key in gmatrix:
            return "'%s'" % sql_fix(gmatrix[key])
        return "\\N"

    data_sub_type = gmatrix[":dataSubType"]
    if data_sub_type in dataSubTypeMap:
        data_sub_type = dataSubTypeMap[data_sub_type]

    yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
    # NOTE(review): the security column is always 'public' here; the
    # matrix's own setting only ends up inside the JSON 'other' column.
    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
        "genomic_" + table_base,
        "sample_" + table_base,
        "clinical_" + clinical_table_base,
        "colDb",
        "genomic_" + table_base + "_alias",
        sql_fix(gmatrix["shortTitle"]),
        sql_fix(gmatrix["longTitle"]),
        len(gmatrix.get_sample_list()),
        self.format,
        data_sub_type,
        "localDb",
        "public",
        float(gmatrix.get("priority", 1.0)),
        float(gmatrix.get("gain", 1.0)),
        sql_fix(gmatrix.get("groupTitle", "Misc.")),
        quoted_or_null("wrangler"),
        quoted_or_null("url"),
        quoted_or_null("articleTitle"),
        quoted_or_null("citation"),
        quoted_or_null("dataProducer"),
        quoted_or_null("wrangling_procedure"),
        sql_fix(json.dumps(other)),
    )

    # write out the sample table
    yield "drop table if exists sample_%s;" % (table_base)
    yield """ CREATE TABLE sample_%s ( id int, sampleName varchar(255) ) engine 'MyISAM'; """ % (table_base)

    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base,
            id_table.get(sample_id_key, sample),
            sql_fix(sample),
        )

    yield "drop table if exists genomic_%s_alias;" % (table_base)
    yield """ CREATE TABLE genomic_%s_alias ( name varchar(255), alias varchar(255) ) engine 'MyISAM'; """ % (table_base)

    for probe in pmap.get_probes():
        for alias in probe.aliases:
            yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                table_base,
                sql_fix(probe.name),
                sql_fix(alias),
            )

    # write out the BED table
    yield "drop table if exists %s;" % ("genomic_" + table_base)
    yield CREATE_BED % ("genomic_" + table_base + "_tmp")

    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below.  A key function gives the same stable
    # ordering as the former cmp= subtraction, and building the two lists
    # separately also copes with an empty sample list.
    ranked = sorted(
        enumerate(gmatrix.get_sample_list()),
        key=lambda pair: id_table.get(sample_id_key, pair[1]),
    )
    order = [index for index, _ in ranked]
    samples = [sample for _, sample in ranked]

    sample_ids = [str(id_table.get(sample_id_key, sample)) for sample in samples]
    exp_ids = ",".join(sample_ids)

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        values = gmatrix.get_row_vals(probe_name)
        row = [values[order[i]] for i in range(len(values))]

        pset = pmap.lookup(probe_name)
        if pset is not None:
            for probe in pset:
                yield "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % (
                    "genomic_%s_tmp" % (table_base),
                    probe.chrom,
                    probe.chrom_start - 1,
                    probe.chrom_end,
                    probe.strand,
                    sql_fix(probe_name),
                    len(sample_ids),
                    exp_ids,
                    self.scores(row),
                )
        else:
            missingProbeCount += 1

    yield "# sort file by chrom position\n"
    yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
    yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (
        table_base,
        table_base,
    )
    yield "drop table genomic_%s_tmp;\n" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def gen_sql_heatmap(self, id_table):
    """Generate, statement by statement, the SQL that loads this track.

    Order of the output: the children's heatmap SQL (produced while
    ``clinicalMatrix`` is withheld from ``self.members``), the refreshed
    ``raDb`` row, the ``sample_<track>`` table, the
    ``genomic_<track>_alias`` table, then the BED rows written to
    ``genomic_<track>_tmp`` and copied into ``genomic_<track>`` ordered by
    chrom/chromStart.

    Args:
        id_table: object whose ``get(namespace, name)`` returns the id of a
            sample; ids are formatted with ``%d`` and used as sort keys, so
            they are assumed to be integers.

    Yields:
        SQL statement strings.  When the probe map has no ``hg18``
        assembly the error is reported and generation stops after the
        children's SQL.

    Raises:
        ValueError: when the matrix ``version`` is not ``YYYY-MM-DD``
            (a missing version is the empty string and also fails).
    """
    #scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
    # it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    try:
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
    finally:
        # Put the member back even if the child pass raises or this
        # generator is closed before the pass completes.
        self.members["clinicalMatrix"] = clinical

    gmatrix = self.members['genomicMatrix']
    # BUG: hard coded to only producing HG18 tables
    pmap = self.members['probeMap'].lookup(assembly="hg18")
    if pmap is None:
        CGData.error("Missing HG18 %s" % (self.members['probeMap'].get_name()))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members["clinicalMatrix"].get_name()
    sample_id_key = clinical_table_base + ':sample_id'

    # Free-form metadata that is serialised as JSON into raDb.other.
    other = {}
    for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
        if attr in gmatrix:
            other[attr] = gmatrix[attr]
    if 'dataProducer' in gmatrix:
        other['author_list'] = gmatrix['dataProducer']
    if 'articleTitle' in gmatrix:
        other['article_title'] = gmatrix['articleTitle']
    other['version'] = gmatrix.get('version', "")
    # Validation only: if the version isn't properly formatted, throw an exception.
    datetime.datetime.strptime(other['version'], "%Y-%m-%d")
    if 'owner' in gmatrix:
        other['owner'] = gmatrix['owner']
    other['colNormalization'] = gmatrix.get('colNormalization', False)
    if not isinstance(other['colNormalization'], bool):
        other['colNormalization'] = False
    other['redistribution'] = gmatrix.get('redistribution', False)
    if not isinstance(other['redistribution'], bool):
        other['redistribution'] = False
    other['security'] = gmatrix.get('security', "public")
    if other['security'] not in ["public", "private"]:
        other['security'] = "public"

    def sql_value(key):
        # Quoted, escaped attribute value, or a bare \N when the attribute
        # is absent (presumably MySQL's NULL shorthand -- confirm).
        if key in gmatrix:
            return "'%s'" % sql_fix(gmatrix[key])
        return '\\N'

    sub_type = gmatrix[':dataSubType']
    if sub_type in dataSubTypeMap:
        sub_type = dataSubTypeMap[sub_type]

    yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
    # NOTE(review): the security column is always 'public' here; the
    # matrix's own setting only ends up inside the JSON 'other' column.
    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
        "genomic_" + table_base,
        "sample_" + table_base,
        "clinical_" + clinical_table_base,
        "colDb",
        "genomic_" + table_base + "_alias",
        sql_fix(gmatrix['shortTitle']),
        sql_fix(gmatrix['longTitle']),
        len(gmatrix.get_sample_list()),
        self.format,
        sub_type,
        'localDb',
        'public',
        float(gmatrix.get('priority', 1.0)),
        float(gmatrix.get('gain', 1.0)),
        sql_fix(gmatrix.get('groupTitle', 'Misc.')),
        sql_value('wrangler'),
        sql_value('url'),
        sql_value('articleTitle'),
        sql_value('citation'),
        sql_value('dataProducer'),
        sql_value('wrangling_procedure'),
        sql_fix(json.dumps(other)),
    )

    # write out the sample table
    yield "drop table if exists sample_%s;" % (table_base)
    yield """ CREATE TABLE sample_%s ( id int, sampleName varchar(255) ) engine 'MyISAM'; """ % (table_base)

    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get(sample_id_key, sample), sql_fix(sample))

    yield "drop table if exists genomic_%s_alias;" % (table_base)
    yield """ CREATE TABLE genomic_%s_alias ( name varchar(255), alias varchar(255) ) engine 'MyISAM'; """ % (table_base)

    for probe in pmap.get_probes():
        for alias in probe.aliases:
            yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ("genomic_" + table_base)
    yield CREATE_BED % ("genomic_" + table_base + "_tmp")

    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below.  Sorting with a key function is the
    # stable equivalent of the former cmp= subtraction; deriving the two
    # lists separately also works when there are no samples at all.
    by_id = sorted(
        enumerate(gmatrix.get_sample_list()),
        key=lambda pair: id_table.get(sample_id_key, pair[1]))
    order = [position for position, _ in by_id]
    samples = [sample for _, sample in by_id]

    sample_ids = [str(id_table.get(sample_id_key, sample)) for sample in samples]
    exp_ids = ','.join(sample_ids)

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        values = gmatrix.get_row_vals(probe_name)
        row = [values[order[i]] for i in range(len(values))]

        pset = pmap.lookup(probe_name)
        if pset is not None:
            for probe in pset:
                yield "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % (
                    "genomic_%s_tmp" % (table_base),
                    probe.chrom,
                    probe.chrom_start-1,
                    probe.chrom_end,
                    probe.strand,
                    sql_fix(probe_name),
                    len(sample_ids),
                    exp_ids,
                    self.scores(row),
                )
        else:
            missingProbeCount += 1

    yield "# sort file by chrom position\n"
    yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
    yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;\n" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def gen_sql_heatmap(self, id_table, opts):
    """Yield the SQL statements that load this track for the heatmap.

    The statements are produced in this order: the children's heatmap SQL
    (with ``clinicalMatrix`` temporarily withheld), a delete/insert pair
    that refreshes the ``raDb`` registration row, the ``sample_<track>``
    table, the ``genomic_<track>_alias`` table, and the binned BED table,
    which is filled through a ``_tmp`` table and then copied ordered by
    chrom/chromStart.  Dots in the track and clinical names are replaced
    by underscores for use in table names.

    Args:
        id_table: object whose ``get(namespace, name)`` returns the id of a
            sample; ids are formatted with ``%d`` and used as sort keys, so
            they are assumed to be integers.
        opts: mapping of options.  A true ``'save-ds'`` carries the
            existing ``downSampleTable`` value of the ``raDb`` row over to
            the new row; a true ``'no-genomic-matrix'`` stops generation
            right after the ``raDb`` row.

    Yields:
        SQL statement strings.  After the children's SQL nothing more is
        yielded when the probe map has no ``hg18`` assembly.

    Raises:
        ValueError: when the selected version is not ``YYYY-MM-DD``.
    """
    #scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
    # it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    try:
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
    finally:
        # Restore the member even when the child pass raises or the
        # consumer closes this generator early.
        self.members["clinicalMatrix"] = clinical

    gmatrix = self.members['genomicMatrix']
    # BUG: hard coded to only producing HG18 tables
    pmap = self.members['probeMap'].lookup(assembly="hg18")
    if pmap is None:
        CGData.error("Missing HG18 %s" % (self.members['probeMap'].get_name()))
        return

    savedownsample = 'save-ds' in opts and opts['save-ds']

    table_base = self.get_name().replace(".", "_")
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members["clinicalMatrix"].get_name().replace(".", "_")
    sample_id_key = clinical_table_base + ':sample_id'

    # Free-form metadata that is serialised as JSON into raDb.other.
    other = {}
    for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
        if attr in gmatrix:
            other[attr] = gmatrix[attr]
    if 'dataProducer' in gmatrix:
        other['author_list'] = gmatrix['dataProducer']
    if 'articleTitle' in gmatrix:
        other['article_title'] = gmatrix['articleTitle']

    # The version reported is the later of the genomic and clinical
    # versions; the genomic one is used when the clinical date is missing.
    cVersion = self.members['clinicalMatrix'].get('version', "")
    gVersion = self.members['genomicMatrix'].get('version', "")
    dG = makeDate(gVersion)
    dC = makeDate(cVersion)
    if dC is None:
        other['version'] = gVersion
    # NOTE(review): dG is not checked for None before this comparison --
    # confirm what makeDate returns for an unparsable genomic version.
    elif dG < dC:
        other['version'] = cVersion
    else:
        other['version'] = gVersion
    # Validation only: if the version isn't properly formatted, throw an exception.
    datetime.datetime.strptime(other['version'], "%Y-%m-%d")

    if 'owner' in gmatrix:
        other['owner'] = gmatrix['owner']
    other['colNormalization'] = gmatrix.get('colNormalization', False)
    if not isinstance(other['colNormalization'], bool):
        other['colNormalization'] = False
    other['redistribution'] = gmatrix.get('redistribution', False)
    if not isinstance(other['redistribution'], bool):
        other['redistribution'] = False

    security = gmatrix.get('security', "public")
    if security not in ["public", "private"]:
        security = "public"

    def quoted_or_null(key):
        # Quoted, escaped attribute value, or a bare \N when the attribute
        # is absent (presumably MySQL's NULL shorthand -- confirm).
        if key in gmatrix:
            return "'%s'" % sql_fix(gmatrix[key])
        return '\\N'

    data_sub_type = gmatrix[':dataSubType']
    if data_sub_type in dataSubTypeMap:
        data_sub_type = dataSubTypeMap[data_sub_type]

    if savedownsample:
        # Remember the current value before the row is deleted below.
        yield "SET @ds=(SELECT downSampleTable FROM raDb WHERE name = '%s');\n" % ("genomic_" + table_base)
    yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
    yield "INSERT into raDb( name, downSampleTable, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', %s, '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
        "genomic_" + table_base,
        "@ds" if savedownsample else "NULL",
        "sample_" + table_base,
        "clinical_" + clinical_table_base,
        "colDb",
        "genomic_" + table_base + "_alias",
        sql_fix(gmatrix['shortTitle']),
        sql_fix(gmatrix['longTitle']),
        len(gmatrix.get_sample_list()),
        self.format,
        data_sub_type,
        'localDb',
        security,
        float(gmatrix.get('priority', 1.0)),
        float(gmatrix.get('gain', 1.0)),
        sql_fix(gmatrix.get('groupTitle', 'Misc.')),
        quoted_or_null('wrangler'),
        quoted_or_null('url'),
        quoted_or_null('articleTitle'),
        quoted_or_null('citation'),
        quoted_or_null('dataProducer'),
        quoted_or_null('wrangling_procedure'),
        sql_fix(json.dumps(other)),
    )

    if 'no-genomic-matrix' in opts and opts['no-genomic-matrix']:
        return

    # write out the sample table
    yield "drop table if exists sample_%s;" % (table_base)
    yield """ CREATE TABLE sample_%s ( id int, sampleName varchar(255) ) engine 'MyISAM'; """ % (table_base)

    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get(sample_id_key, sample), sql_fix(sample))

    yield "drop table if exists genomic_%s_alias;" % (table_base)
    yield """ CREATE TABLE genomic_%s_alias ( name varchar(255), alias varchar(255) ) engine 'MyISAM'; """ % (table_base)

    for probe in pmap.get_probes():
        for alias in probe.aliases:
            yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ("genomic_" + table_base)
    yield CREATE_BED % ("genomic_" + table_base + "_tmp")

    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below.  A key function gives the same stable
    # ordering as the former cmp= subtraction, and building the two lists
    # separately also copes with an empty sample list.
    ranked = sorted(
        enumerate(gmatrix.get_sample_list()),
        key=lambda pair: id_table.get(sample_id_key, pair[1]))
    order = [index for index, _ in ranked]
    samples = [sample for _, sample in ranked]

    sample_ids = [str(id_table.get(sample_id_key, sample)) for sample in samples]
    exp_ids = ','.join(sample_ids)

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        values = gmatrix.get_row_vals(probe_name)
        row = [values[order[i]] for i in range(len(values))]

        pset = pmap.lookup(probe_name)
        if pset is not None:
            for probe in pset:
                # NOTE(review): the bin is computed from chrom_start while
                # the stored chromStart is chrom_start-1 -- confirm which
                # coordinate Binner.calcBin expects.
                yield "insert into %s(bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( %d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % (
                    "genomic_%s_tmp" % (table_base),
                    Binner.calcBin(probe.chrom_start, probe.chrom_end),
                    probe.chrom,
                    probe.chrom_start-1,
                    probe.chrom_end,
                    probe.strand,
                    sql_fix(probe_name),
                    len(sample_ids),
                    exp_ids,
                    self.scores(row),
                )
        else:
            missingProbeCount += 1

    yield "# sort file by chrom position\n"
    yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
    yield "insert into genomic_%s(bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) select bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;\n" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))