def scan_dirs(self, dirs):
    for dir in dirs:
        CGData.log("SCANNING DIR: %s" % (dir))
        if os.path.isdir(dir):
            filePath = os.path.join(dir, "*")
        else:
            filePath = dir
        for path in glob(filePath):
            if os.path.isfile(path):
                if path.endswith(".json"):
                    handle = open(path)
                    try:
                        data = json.loads(handle.read())
                    except ValueError, e:
                        CGData.error("BAD JSON in " + path + " " + str(e))
                        data = None
                    handle.close()
                    if (data is not None and 'name' in data
                            and data['name'] is not None and 'type' in data):
                        self.addFile(data['type'], data['name'], path)
                if path.endswith(".cgz"):
                    cgzList = CGData.CGZ.list(path)
                    for type in cgzList:
                        for zPath in cgzList[type]:
                            self.addFile(type, cgzList[type][zPath], zPath, path)
            if os.path.isdir(path):
                self.scan_dirs([path])
def gen_sql_heatmap(self, id_table, opts):
    CGData.log("ClinicalTrack SQL " + self.get_name())
    features = self.members["clinicalFeature"].features
    matrix = self.members["clinicalMatrix"]
    # e.g. { 'HER2+': 'category', ...}
    explicit_types = dict((f, features[f]['valueType'])
                          for f in features if 'valueType' in features[f])
    matrix.feature_type_setup(explicit_types)
    for a in self.members['clinicalMatrix'].col_list:
        if a in features and "stateOrder" in features[a]:
            enums = [x for x in csv.reader(features[a]["stateOrder"], skipinitialspace=True)][0]
            i = 0
            #do not drop states in stateOrder
            for e in enums:
                matrix.enum_map[a][e] = enums.index(e)
            for e in matrix.enum_map[a]:
                if e in enums:
                    matrix.enum_map[a][e] = enums.index(e)
                else:
                    matrix.enum_map[a][e] = len(enums) + i
                    i += 1
    for a in matrix.gen_sql_heatmap(id_table, features=features):
        yield a
class SampleMap(CGData.CGDataSetObject):
    DATA_FORM = CGData.TABLE
    COLS = [
        CGData.Column('node_name', str, primary_key=True),
        CGData.Column('parent', str),
        CGData.Column('child', str)
    ]

    def __init__(self):
        CGData.CGDataSetObject.__init__(self)
        self.sample_hash = {}

    def read(self, handle):
        for line in handle:
            tmp = line.rstrip().split('\t')
            if not tmp[0] in self.sample_hash:
                self.sample_hash[tmp[0]] = {}
            if len(tmp) > 1:
                self.sample_hash[tmp[0]][tmp[1]] = True

    def get_children(self, sample):
        out = {}
        for a in self.sample_hash.get(sample, {}):
            out[a] = True
            for c in self.get_children(a):
                out[c] = True
        return out.keys()
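# Usage sketch (an assumption, not part of the original source): read a
# parent/child tab-separated file into a SampleMap and list all descendants
# of one node. "sampleMap.tab" and "TCGA-01" are hypothetical names.
smap = SampleMap()
handle = open("sampleMap.tab")
smap.read(handle)
handle.close()
print smap.get_children("TCGA-01")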
def gen_sql(self, id_table):
    CGData.log("ClinicalTrack SQL " + self.get_name())
    matrix = self.members["clinicalMatrix"]
    matrix.feature_type_setup()
    features = self.members["clinicalFeature"]
    #print features
    for a in features:
        if "stateOrder" in features[a]:
            #print features[a]["stateOrder"][0]
            #this weird bit of code is to split on ',', but respect \,
            #if you can think of a better way, please replace this
            tmp = re.split(r'([^,]),', features[a]["stateOrder"][0])
            enums = []
            word = True
            appending = False
            e = 0
            while e < len(tmp):
                if word:
                    if appending:
                        enums[-1] += tmp[e]
                    else:
                        enums.append(tmp[e])
                    word = False
                else:
                    if tmp[e] != "\\":
                        enums[-1] += tmp[e]
                        appending = False
                    else:
                        enums[-1] += ","
                        appending = True
                    word = True
                e += 1
            #print tmp
            #print enums
            #print matrix.enum_map[a]
            i = 0
            for e in matrix.enum_map[a]:
                if e in enums:
                    matrix.enum_map[a][e] = enums.index(e)
                else:
                    matrix.enum_map[a][e] = len(enums) + i
                    i += 1
            #print matrix.enum_map[a]
            #print "-=-=-=-=-"
    for a in matrix.gen_sql(id_table, skip_feature_setup=True):
        yield a
def gen_sql_heatmap(self, id_table, skip_feature_setup=False):
    CGData.log("Writing Clinical %s SQL" % (self.attrs['name']))
    if not skip_feature_setup:
        self.feature_type_setup()
    table_name = self.attrs['name']
    yield "drop table if exists clinical_%s;" % ( table_name )
    yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )
    for col in self.col_order:
        if ( self.enum_map.has_key( col ) ):
            yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(),
                "','".join( sql_fix(a) for a in sorted(self.enum_map[ col ].keys(),
                    lambda x,y: self.enum_map[col][x]-self.enum_map[col][y]) ) )
        else:
            yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
    yield """
) engine 'MyISAM';
"""
    for target in sortedSamples(self.row_hash.keys()):
        a = []
        for col in self.orig_order:
            val = self.row_hash[ target ][ self.col_list[ col ] ]
            if val is None or val == "null" or len(val) == 0:
                a.append("\\N")
            else:
                a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
        yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % (
            table_name, id_table.get( table_name + ':sample_id', target ),
            sql_fix(target), u",".join(a) )
    yield "drop table if exists clinical_%s_colDb;" % ( table_name )
    yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" )
    yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
        ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName',
          "clinical_" + table_name, 'coded' )
    i = 0
    for name in self.col_order:
        filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
            ( table_name, name, name, name, name, "clinical_" + table_name,
              filter, 'on' if i < 10 else 'off')
        i += 1
class DataSubType(CGData.CGDataSetObject):
    DATA_FORM = CGData.TABLE
    COLS = [
        CGData.Column('name', str, primary_key=True),
    ]

    def __init__(self):
        CGData.CGDataSetObject.__init__(self)
def addFile(self, type, name, path, zipFile=None):
    if CGData.has_type(type):
        if not type in self:
            self[type] = {}
        if name in self[type]:
            CGData.error("Duplicate %s file %s" % (type, name))
        self[type][name] = CGData.light_load(path, zipFile)
        CGData.log("FOUND: " + type + "\t" + name + "\t" + path)
    else:
        CGData.warn("Unknown file type: %s" % (path))
class Assembly(CGData.CGObjectBase):
    """
    Blank Class to represent Genome Assemblies
    """
    DATA_FORM = CGData.TABLE
    COLS = [
        CGData.Column('name', str, primary_key=True),
    ]
def append(self, probe):
    for attr in self.child_type.core_attr:
        if not hasattr(probe, attr):
            raise CGData.FormatException("Missing %s" % (attr))
    if self.chrom_map is None:
        self.chrom_map = {}
    if not probe.chrom in self.chrom_map:
        self.chrom_map[probe.chrom] = {}
    if not probe.name in self.chrom_map[probe.chrom]:
        self.chrom_map[probe.chrom][probe.name] = [probe]
    else:
        self.chrom_map[probe.chrom][probe.name].append(probe)
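# Illustrative note (an assumption based on the code above): after appending,
# chrom_map is nested as { chrom: { probe_name: [Probe, ...] } }, so a single
# probe name may map to several placements on the same chromosome.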
def gen_sql(self, id_table):
    gmatrix = self.members[ 'genomicMatrix' ]
    pmap = self.members[ 'probeMap' ].get( assembly="hg18" )
    # BUG: hard coded to only producing HG18 tables
    if pmap is None:
        CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
        return
    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members[ "clinicalMatrix" ].get_name()
    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
        ( "genomic_" + table_base,
          "sample_" + table_base,
          "clinical_" + clinical_table_base,
          "clinical_" + clinical_table_base + "_colDb",
          "genomic_" + table_base + "_alias",
          table_base,
          len(gmatrix.get_sample_list()),
          'bed 15',
          gmatrix.attrs[':dataSubType'],
          'localDb',
          'public',
        )
    # write out the sample table
    yield "drop table if exists sample_%s;" % ( table_base )
    yield """
CREATE TABLE sample_%s (
    id          int,
    sampleName  varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    for sample in gmatrix.get_sample_list():
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( 'sample_id', sample), sample )
    # write out the BED table
    yield "drop table if exists %s;" % ( "genomic_" + table_base )
    yield CREATE_BED % ( "genomic_" + table_base )
    sample_ids = []
    for sample in gmatrix.get_sample_list():
        sample_ids.append( str( id_table.get( 'sample_id', sample ) ) )
    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        exp_ids = ','.join( sample_ids )
        row = gmatrix.get_row_vals( probe_name )
        exps = ','.join( str(a) for a in row )
        probe = pmap.get( probe_name )
        if probe is not None:
            istr = "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' );\n" % \
                ( "genomic_%s" % (table_base), probe.chrom, probe.chrom_start,
                  probe.chrom_end, probe.strand, sql_fix(probe_name),
                  len(sample_ids), exp_ids, exps )
            yield istr
        else:
            missingProbeCount += 1
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def gen_sql(self):
    if "compiler.mode" in self.params and self.params[ "compiler.mode" ] == "scan":
        return
    log( "Writing SQL" )
    if not os.path.exists(self.out_dir):
        os.makedirs(self.out_dir)
    self.id_table = CGIDTable()
    for rtype in self.compile_matrix:
        if issubclass( CGData.get_type( rtype ), CGData.CGSQLObject ):
            for rname in self.compile_matrix[ rtype ]:
                shandle = self.compile_matrix[ rtype ][ rname ].gen_sql( self.id_table )
                if shandle is not None:
                    ohandle = open( os.path.join( self.out_dir, "%s.%s.sql" % (rtype, rname) ), "w" )
                    for line in shandle:
                        ohandle.write( line )
                    ohandle.close()
                #tell the object to unload data, so we don't continually
                #allocate over the compile
                self.compile_matrix[ rtype ][ rname ].unload()
def gen_sql_heatmap(self, id_table, features=None):
    CGData.log( "Writing Clinical %s SQL" % (self['name']))
    if features == None:
        self.feature_type_setup()
        features = {}
    features['sampleName'] = {
        'shortTitle': ['Sample name'],
        'longTitle': ['Sample name'],
        'visibility': ['on'],
        'priority': [100.0]
    }
    table_name = self['name'].replace(".", "_")
    clinical_table = 'clinical_' + table_name
    yield "DROP TABLE IF EXISTS %s;\n" % ( clinical_table )
    yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
    yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

    # colDb
    i = 0
    for name in self.col_order:
        shortLabel = name if name not in features or 'shortTitle' not in features[name] else features[name]['shortTitle'][0]
        longLabel = name if name not in features or 'longTitle' not in features[name] else features[name]['longTitle'][0]
        filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
        visibility = ('on' if i < 10 else 'off') if name not in features or 'visibility' not in features[name] else features[name]['visibility'][0]
        priority = 1 if name not in features or 'priority' not in features[name] else float(features[name]['priority'][0])
        yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
            ( sql_fix(name), sql_fix(shortLabel), sql_fix(longLabel), sql_fix(name),
              clinical_table, filter, visibility, priority)
        yield "SET @col%d=LAST_INSERT_ID();\n" % i
        i += 1

    # codes
    i = 0
    values = {}
    for col in self.col_order:
        if ( self.enum_map.has_key( col ) ):
            values[col] = {}
            j = 0
            for a in sorted(self.enum_map[ col ].keys(),
                            lambda x,y: self.enum_map[col][x]-self.enum_map[col][y]):
                yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (i, j, sql_fix(a), i, j)
                values[col][a] = "@val%d_%d" % (i, j)
                j += 1
        i += 1

    yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table
    for col in self.col_order:
        if col == 'sampleName':
            yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
        else:
            if self.enum_map.has_key(col):
                yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
            else:
                yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
    yield """
) engine 'MyISAM';
"""
    for target in sortedSamples(self.row_hash.keys()):
        a = []
        for col, orig in zip(self.col_order, self.orig_order):
            if col == 'sampleName':
                val = target
            else:
                val = self.row_hash[ target ][ self.col_list[ orig ] ]
            if val is None or val.upper() in NULL_VALUES:
                a.append("\\N")
            else:
                if col in self.enum_map:
                    a.append(values[col][val])
                else:
                    a.append(val)
        yield u"INSERT INTO %s VALUES ( %d, %s );\n" % (
            clinical_table, id_table.get( table_name + ':sample_id', target ),
            u",".join(a) )
def link_objects(self):
    """
    Scan found object records and determine if the data they link to is available
    """
    omatrix = {}
    for otype in self.set_hash:
        if issubclass( CGData.get_type( otype ), CGData.CGGroupMember ):
            gmap = {}
            for oname in self.set_hash[ otype ]:
                oobj = self.set_hash[ otype ][ oname ]
                if oobj.get_group() not in gmap:
                    if issubclass(CGData.get_type(otype), CGData.CGGroupMemberSQL):
                        gmap[ oobj.get_group() ] = CGData.CGGroupBaseSQL( oobj.get_group() )
                    else:
                        gmap[ oobj.get_group() ] = CGData.CGGroupBase( oobj.get_group() )
                gmap[ oobj.get_group() ].put( oobj )
            omatrix[ otype ] = gmap
        else:
            omatrix[ otype ] = self.set_hash[ otype ]

    # Now it's time to check objects for their dependencies
    ready_matrix = {}
    for stype in omatrix:
        for sname in omatrix[ stype ]:
            sobj = omatrix[ stype ][ sname ]
            lmap = sobj.get_link_map()
            is_ready = True
            for ltype in lmap:
                if not omatrix.has_key( ltype ):
                    warn( "%s missing data type %s" % (sname, ltype) )
                    is_ready = False
                else:
                    for lname in lmap[ ltype ]:
                        if not omatrix[ltype].has_key( lname ):
                            warn( "%s %s missing data %s %s" % ( stype, sname, ltype, lname ) )
                            is_ready = False
            if not sobj.is_link_ready():
                warn( "%s %s not LinkReady" % ( stype, sname ) )
            elif is_ready:
                if not stype in ready_matrix:
                    ready_matrix[ stype ] = {}
                ready_matrix[ stype ][ sname ] = sobj

    for rtype in ready_matrix:
        log( "READY %s: %s" % ( rtype, ",".join(ready_matrix[rtype].keys()) ) )
    for dType in ready_matrix:
        log("Found %s %d" % (dType, len(ready_matrix[dType])))

    merge_children = {}
    for merge_type in CGData.MERGE_OBJECTS:
        mtype = CGData.get_type( merge_type )
        select_types = mtype.typeSet
        select_set = {}
        try:
            for stype in select_types:
                select_set[ stype ] = ready_matrix[ stype ]
                if stype not in merge_children:
                    merge_children[stype] = {}
        except KeyError:
            error("missing data type %s" % (stype) )
            continue
        mobjlist = self.set_enumerate( mtype, select_set )
        for mobj in mobjlist:
            if merge_type not in ready_matrix:
                ready_matrix[ merge_type ] = {}
            for cType in mobj:
                merge_children[cType][mobj[cType].get_name()] = True
            ready_matrix[ merge_type ][ mobj.get_name() ] = mobj

    self.compile_matrix = {}
    for sType in ready_matrix:
        self.compile_matrix[sType] = {}
        for name in ready_matrix[sType]:
            if sType not in merge_children or name not in merge_children[sType]:
                self.compile_matrix[sType][name] = ready_matrix[sType][name]

    log("After Merge")
    for dType in ready_matrix:
        log("Found %s %d" % (dType, len(self.compile_matrix[dType])))
#!/usr/bin/env python
#note: this script assumes that the first alias in the probeMap aliaslist
#is a HUGO gene name...

import CGData
import sys

# matrixProbeRemap.py <matrixFile> <probeFile>

#load the matrix
matrix = CGData.load(sys.argv[1])

#load the probeMap
probeMap = CGData.load(sys.argv[2])

#remove null probes from the matrix
matrix.remove_null_probes()

#remap the matrix using the probe map
matrix.remap(probeMap, skip_missing=True)
matrix.add_history("Transformed from probespace %s to HUGO" % (probeMap.get_name()))
matrix.attrs[":probeMap"] = "hugo"

#output the matrix
matrix.store(sys.argv[3])
import CGData
import CGData.NumpyMatrix
import sys

# matrixProbeRemap.py <matrixFile> <probeFile>

#load the matrix
matrix = CGData.NumpyMatrix.NumpyMatrix()
matrixHandle = open(sys.argv[1])
matrix.read(matrixHandle)
matrixHandle.close()

#load the probeMap
probeMap = CGData.load( sys.argv[2] )

#remove null probes from the matrix
matrix.remove_null_probes()

#remap the matrix using the probe map
valid_map = {}
for alt in probeMap.get_probes():
    valid_map[alt.aliases[0]] = True
    if alt.name in matrix.get_row_names():
        matrix.row_rename(alt.name, alt.aliases[0])

remove_list = []
for name in matrix.get_row_names():
    if not name in valid_map:
        remove_list.append(name)
def gen_sql_heatmap(self, id_table):
    #scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic
    # from calling it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
        yield line
    self.members["clinicalMatrix"] = clinical

    gmatrix = self.members[ 'genomicMatrix' ]
    pmap = self.members[ 'probeMap' ].get( assembly="hg18" )
    # BUG: hard coded to only producing HG18 tables
    if pmap is None:
        CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members[ "clinicalMatrix" ].get_name()

    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
        ( "genomic_" + table_base,
          "sample_" + table_base,
          "clinical_" + clinical_table_base,
          "clinical_" + clinical_table_base + "_colDb",
          "genomic_" + table_base + "_alias",
          sql_fix(gmatrix.attrs['shortTitle']),
          sql_fix(gmatrix.attrs['longTitle']),
          len(gmatrix.get_sample_list()),
          self.format,
          gmatrix.attrs[':dataSubType'],
          'localDb',
          'public',
        )

    # write out the sample table
    yield "drop table if exists sample_%s;" % ( table_base )
    yield """
CREATE TABLE sample_%s (
    id          int,
    sampleName  varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get( clinical_table_base + ':sample_id', sample), sample )

    yield "drop table if exists genomic_%s_alias;" % ( table_base )
    yield """
CREATE TABLE genomic_%s_alias (
    name    varchar(255),
    alias   varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    for pset in pmap:
        for probe in pset:
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ( "genomic_" + table_base )
    yield CREATE_BED % ( "genomic_" + table_base + "_tmp")

    sample_ids = []
    samples = gmatrix.get_sample_list()
    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below
    tmp = sorted(zip(samples, range(len(samples))),
                 cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) -
                                 id_table.get(clinical_table_base + ':sample_id', y[0]))
    samples, order = map(lambda t: list(t), zip(*tmp))
    for sample in samples:
        sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
    exp_ids = ','.join( sample_ids )

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        tmp = gmatrix.get_row_vals( probe_name )
        row = map(lambda i: tmp[order[i]], range(len(tmp)))
        pset = pmap.get( probe_name )
        if pset is not None:
            for probe in pset:
                istr = "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                    ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start,
                      probe.chrom_end, probe.strand, sql_fix(probe_name),
                      len(sample_ids), exp_ids, self.scores(row) )
                yield istr
        else:
            missingProbeCount += 1

    yield "create table genomic_%s like genomic_%s_tmp;" % (table_base, table_base)
    yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def link_objects(self):
    """
    Scan found object records and determine if the data they link to is available
    """
    omatrix = {}
    for otype in self.set_hash:
        if issubclass(CGData.get_type(otype), CGData.CGGroupMember):
            gmap = {}
            for oname in self.set_hash[otype]:
                oobj = self.set_hash[otype][oname]
                if oobj.get_group() not in gmap:
                    gmap[oobj.get_group()] = CGData.CGGroupBase(oobj.get_group())
                gmap[oobj.get_group()].put(oobj)
            omatrix[otype] = gmap
        else:
            omatrix[otype] = self.set_hash[otype]

    # Now it's time to check objects for their dependencies
    ready_matrix = {}
    for stype in omatrix:
        for sname in omatrix[stype]:
            sobj = omatrix[stype][sname]
            lmap = sobj.get_link_map()
            is_ready = True
            for ltype in lmap:
                if not omatrix.has_key(ltype):
                    warn("%s missing data type %s" % (sname, ltype))
                    is_ready = False
                else:
                    for lname in lmap[ltype]:
                        if not omatrix[ltype].has_key(lname):
                            warn("%s %s missing data %s %s" % (stype, sname, ltype, lname))
                            is_ready = False
            if not sobj.is_link_ready():
                warn("%s %s not LinkReady" % (stype, sname))
            elif is_ready:
                if not stype in ready_matrix:
                    ready_matrix[stype] = {}
                ready_matrix[stype][sname] = sobj

    for rtype in ready_matrix:
        log("READY %s: %s" % (rtype, ",".join(ready_matrix[rtype].keys())))
    for dType in ready_matrix:
        log("Found %s %d" % (dType, len(ready_matrix[dType])))

    merge_children = {}
    for merge_type in CGData.MERGE_OBJECTS:
        mtype = CGData.get_type(merge_type)
        select_types = mtype.typeSet
        select_set = {}
        try:
            for stype in select_types:
                select_set[stype] = ready_matrix[stype]
                if stype not in merge_children:
                    merge_children[stype] = {}
        except KeyError:
            error("missing data type %s" % (stype))
            continue
        mobjlist = self.set_enumerate(mtype, select_set)
        for mobj in mobjlist:
            if merge_type not in ready_matrix:
                ready_matrix[merge_type] = {}
            for cType in mobj:
                merge_children[cType][mobj[cType].get_name()] = True
            ready_matrix[merge_type][mobj.get_name()] = mobj

    self.compile_matrix = {}
    for sType in ready_matrix:
        self.compile_matrix[sType] = {}
        for name in ready_matrix[sType]:
            if sType not in merge_children or name not in merge_children[sType]:
                self.compile_matrix[sType][name] = ready_matrix[sType][name]

    log("After Merge")
    for dType in ready_matrix:
        log("Found %s %d" % (dType, len(self.compile_matrix[dType])))
def gen_sql_heatmap(self, id_table):
    #scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic
    # from calling it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
        yield line
    self.members["clinicalMatrix"] = clinical

    gmatrix = self.members[ 'genomicMatrix' ]
    pmap = self.members[ 'probeMap' ].lookup( assembly="hg18" )
    # BUG: hard coded to only producing HG18 tables
    if pmap is None:
        CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
        return

    table_base = self.get_name()
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members[ "clinicalMatrix" ].get_name()

    other = {}
    for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
        if attr in gmatrix:
            other[attr] = gmatrix[attr]
    if 'dataProducer' in gmatrix:
        other['author_list'] = gmatrix['dataProducer']
    if 'articleTitle' in gmatrix:
        other['article_title'] = gmatrix['articleTitle']
    other['version'] = gmatrix.get('version', "")
    datetime.datetime.strptime(other['version'], "%Y-%m-%d") #if the version isn't properly formatted, throw exception
    if 'owner' in gmatrix:
        other['owner'] = gmatrix['owner']
    other['colNormalization'] = gmatrix.get('colNormalization', False)
    if not isinstance(other['colNormalization'], bool):
        other['colNormalization'] = False
    other['redistribution'] = gmatrix.get('redistribution', False)
    if not isinstance(other['redistribution'], bool):
        other['redistribution'] = False
    other['security'] = gmatrix.get('security', "public")
    if other['security'] not in [ "public", "private" ]:
        other['security'] = "public"

    yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
    yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % \
        ( "genomic_" + table_base,
          "sample_" + table_base,
          "clinical_" + clinical_table_base,
          "colDb",
          "genomic_" + table_base + "_alias",
          sql_fix(gmatrix['shortTitle']),
          sql_fix(gmatrix['longTitle']),
          len(gmatrix.get_sample_list()),
          self.format,
          dataSubTypeMap[gmatrix[':dataSubType']] if gmatrix[':dataSubType'] in dataSubTypeMap else gmatrix[':dataSubType'],
          'localDb',
          'public',
          float(gmatrix.get('priority', 1.0)),
          float(gmatrix.get('gain', 1.0)),
          sql_fix(gmatrix.get('groupTitle', 'Misc.')),
          "'%s'" % sql_fix(gmatrix['wrangler']) if 'wrangler' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['url']) if 'url' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['articleTitle']) if 'articleTitle' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['citation']) if 'citation' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['dataProducer']) if 'dataProducer' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['wrangling_procedure']) if 'wrangling_procedure' in gmatrix else '\N',
          sql_fix(json.dumps(other)),
        )

    # write out the sample table
    yield "drop table if exists sample_%s;" % ( table_base )
    yield """
CREATE TABLE sample_%s (
    id          int,
    sampleName  varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get( clinical_table_base + ':sample_id', sample), sql_fix(sample) )

    yield "drop table if exists genomic_%s_alias;" % ( table_base )
    yield """
CREATE TABLE genomic_%s_alias (
    name    varchar(255),
    alias   varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    for probe in pmap.get_probes():
        for alias in probe.aliases:
            yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ( "genomic_" + table_base )
    yield CREATE_BED % ( "genomic_" + table_base + "_tmp")

    sample_ids = []
    samples = gmatrix.get_sample_list()
    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below
    tmp = sorted(zip(samples, range(len(samples))),
                 cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) -
                                 id_table.get(clinical_table_base + ':sample_id', y[0]))
    samples, order = map(lambda t: list(t), zip(*tmp))
    for sample in samples:
        sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
    exp_ids = ','.join( sample_ids )

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        tmp = gmatrix.get_row_vals( probe_name )
        row = map(lambda i: tmp[order[i]], range(len(tmp)))
        pset = pmap.lookup( probe_name )
        if pset is not None:
            for probe in pset:
                istr = "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                    ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start-1,
                      probe.chrom_end, probe.strand, sql_fix(probe_name),
                      len(sample_ids), exp_ids, self.scores(row) )
                yield istr
        else:
            missingProbeCount += 1

    yield "# sort file by chrom position\n"
    yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
    yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;\n" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
#!/usr/bin/env python

import sys
import CGData
import CGData.Compiler

c = CGData.Compiler.BrowserCompiler()
c.scan_dirs(sys.argv[1:])

linkSpace = {}
for type in c.set_hash:
    if issubclass( CGData.get_type( type ), CGData.CGDataMatrixObject ):
        for name in c.set_hash[type]:
            current = "%s:%s" % (type, name)
            x_link = c.set_hash[type][name].get_x_namespace()
            y_link = c.set_hash[type][name].get_y_namespace()
            if x_link is not None:
                if x_link not in linkSpace:
                    linkSpace[x_link] = {}
                linkSpace[x_link][current] = True
                print "%s x_link %s" % (current, x_link)
            if y_link is not None:
                if y_link not in linkSpace:
                    linkSpace[y_link] = {}
                linkSpace[y_link][current] = True
                print "%s y_link %s" % (current, y_link)

for ns in linkSpace:
    s = linkSpace[ns].keys()
def gen_sql(self, id_table, skip_feature_setup=False):
    CGData.log( "Gen %s SQL" % (self.attrs['name']))
    if not skip_feature_setup:
        self.feature_type_setup()
    table_name = self.attrs['name']
    yield "drop table if exists clinical_%s;" % ( table_name )
    yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )
    for col in self.col_order:
        if ( self.enum_map.has_key( col ) ):
            yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(),
                "','".join( sql_fix(a) for a in self.enum_map[ col ].keys() ) )
        else:
            yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
    yield """
) engine 'MyISAM';
"""
    for target in sortedSamples(self.row_hash.keys()):
        a = []
        for col in self.orig_order:
            val = self.row_hash[ target ][ self.col_list[ col ] ]
            #print target, col, val
            if val is None or val == "null" or len(val) == 0:
                a.append("\\N")
            else:
                #a.append( "'" + sql_fix(val) + "'" )
                a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
        yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % (
            table_name, id_table.get( 'sample_id', target ), sql_fix(target), u",".join(a) )
        #yield u"INSERT INTO clinical_%s VALUES ( %d, %s );\n" % ( table_name, id_table.get( 'sample_id', target ), u",".join(a) )

    yield "drop table if exists clinical_%s_colDb;" % ( table_name )
    yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" )
    """
    `id` int(10) unsigned NOT NULL default '0',
    `name` varchar(255) default NULL,
    `shortLabel` varchar(255) default NULL,
    `longLabel` varchar(255) default NULL,
    `valField` varchar(255) default NULL,
    `clinicalTable` varchar(255) default NULL,
    `priority` float default NULL,
    `filterType` varchar(255) default NULL,
    `visibility` varchar(255) default NULL,
    `groupName` varchar(255) default NULL,
    PRIMARY KEY (`id`),
    KEY `name` (`name`)
    """
    yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
        ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName',
          "clinical_" + table_name, 'coded' )
    i = 0
    for name in self.col_order:
        filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
            ( table_name, name, name, name, name, "clinical_" + table_name,
              filter, 'on' if i < 10 else 'off')
        i += 1
def gen_sql_heatmap(self, id_table, opts):
    #scan the children
    # XXX Handling of sql for children is broken if the child may appear
    # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
    # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic
    # from calling it for gen_sql.
    clinical = self.members.pop("clinicalMatrix")
    for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
        yield line
    self.members["clinicalMatrix"] = clinical

    gmatrix = self.members[ 'genomicMatrix' ]
    pmap = self.members[ 'probeMap' ].lookup( assembly="hg18" )
    # BUG: hard coded to only producing HG18 tables
    if pmap is None:
        CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
        return

    savedownsample = 'save-ds' in opts and opts['save-ds']
    table_base = self.get_name().replace(".", "_")
    CGData.log("Writing Track %s" % (table_base))
    clinical_table_base = self.members[ "clinicalMatrix" ].get_name().replace(".", "_")

    other = {}
    for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
        if attr in gmatrix:
            other[attr] = gmatrix[attr]
    if 'dataProducer' in gmatrix:
        other['author_list'] = gmatrix['dataProducer']
    if 'articleTitle' in gmatrix:
        other['article_title'] = gmatrix['articleTitle']

    ##TO DO: the version info should be the latest of genomic and clinical,
    ##currently only check genomic
    cVersion = self.members[ 'clinicalMatrix' ].get('version', "")
    gVersion = self.members[ 'genomicMatrix' ].get('version', "")
    dG = makeDate(gVersion)
    dC = makeDate(cVersion)
    if dC == None:
        other['version'] = gVersion
    elif dG < dC:
        other['version'] = cVersion
    else:
        other['version'] = gVersion
    datetime.datetime.strptime(other['version'], "%Y-%m-%d") #if the version isn't properly formatted, throw exception

    if 'owner' in gmatrix:
        other['owner'] = gmatrix['owner']
    other['colNormalization'] = gmatrix.get('colNormalization', False)
    if not isinstance(other['colNormalization'], bool):
        other['colNormalization'] = False
    other['redistribution'] = gmatrix.get('redistribution', False)
    if not isinstance(other['redistribution'], bool):
        other['redistribution'] = False
    security = gmatrix.get('security', "public")
    if security not in [ "public", "private" ]:
        security = "public"

    if savedownsample:
        yield "SET @ds=(SELECT downSampleTable FROM raDb WHERE name = '%s');\n" % ("genomic_" + table_base)
    yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
    yield "INSERT into raDb( name, downSampleTable, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', %s, '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % \
        ( "genomic_" + table_base,
          "@ds" if savedownsample else "NULL",
          "sample_" + table_base,
          "clinical_" + clinical_table_base,
          "colDb",
          "genomic_" + table_base + "_alias",
          sql_fix(gmatrix['shortTitle']),
          sql_fix(gmatrix['longTitle']),
          len(gmatrix.get_sample_list()),
          self.format,
          dataSubTypeMap[gmatrix[':dataSubType']] if gmatrix[':dataSubType'] in dataSubTypeMap else gmatrix[':dataSubType'],
          'localDb',
          security,
          float(gmatrix.get('priority', 1.0)),
          float(gmatrix.get('gain', 1.0)),
          sql_fix(gmatrix.get('groupTitle', 'Misc.')),
          "'%s'" % sql_fix(gmatrix['wrangler']) if 'wrangler' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['url']) if 'url' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['articleTitle']) if 'articleTitle' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['citation']) if 'citation' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['dataProducer']) if 'dataProducer' in gmatrix else '\N',
          "'%s'" % sql_fix(gmatrix['wrangling_procedure']) if 'wrangling_procedure' in gmatrix else '\N',
          sql_fix(json.dumps(other)),
        )

    if 'no-genomic-matrix' in opts and opts['no-genomic-matrix']:
        return

    # write out the sample table
    yield "drop table if exists sample_%s;" % ( table_base )
    yield """
CREATE TABLE sample_%s (
    id          int,
    sampleName  varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    from CGData.ClinicalMatrix import sortedSamples
    for sample in sortedSamples(gmatrix.get_sample_list()):
        yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
            table_base, id_table.get( clinical_table_base + ':sample_id', sample), sql_fix(sample) )

    yield "drop table if exists genomic_%s_alias;" % ( table_base )
    yield """
CREATE TABLE genomic_%s_alias (
    name    varchar(255),
    alias   varchar(255)
) engine 'MyISAM';
""" % ( table_base )
    for probe in pmap.get_probes():
        for alias in probe.aliases:
            yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

    # write out the BED table
    yield "drop table if exists %s;" % ( "genomic_" + table_base )
    yield CREATE_BED % ( "genomic_" + table_base + "_tmp")

    sample_ids = []
    samples = gmatrix.get_sample_list()
    # sort samples by sample_id, and retain the sort order for application
    # to the genomic data, below
    tmp = sorted(zip(samples, range(len(samples))),
                 cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) -
                                 id_table.get(clinical_table_base + ':sample_id', y[0]))
    samples, order = map(lambda t: list(t), zip(*tmp))
    for sample in samples:
        sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
    exp_ids = ','.join( sample_ids )

    missingProbeCount = 0
    for probe_name in gmatrix.get_probe_list():
        # get the genomic data and rearrange to match the sample_id order
        tmp = gmatrix.get_row_vals( probe_name )
        row = map(lambda i: tmp[order[i]], range(len(tmp)))
        pset = pmap.lookup( probe_name )
        if pset is not None:
            for probe in pset:
                istr = "insert into %s(bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( %d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                    ( "genomic_%s_tmp" % (table_base),
                      Binner.calcBin(probe.chrom_start, probe.chrom_end),
                      probe.chrom, probe.chrom_start-1, probe.chrom_end, probe.strand,
                      sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                yield istr
        else:
            missingProbeCount += 1

    yield "# sort file by chrom position\n"
    yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
    yield "insert into genomic_%s(bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) select bin, chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
    yield "drop table genomic_%s_tmp;\n" % table_base
    CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def gen_sql_heatmap(self, id_table, features=None):
    CGData.log("Writing Clinical %s SQL" % (self['name']))
    if features == None:
        self.feature_type_setup()
        features = {}
    features['sampleName'] = {
        'shortTitle': ['Sample name'],
        'longTitle': ['Sample name'],
        'visibility': ['on'],
        'priority': [1]
    }
    table_name = self['name']
    clinical_table = 'clinical_' + table_name
    yield "DROP TABLE IF EXISTS %s;\n" % (clinical_table)
    yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
    yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

    # colDb
    i = 0
    for name in self.col_order:
        shortLabel = name if name not in features or 'shortTitle' not in features[name] else features[name]['shortTitle'][0]
        longLabel = name if name not in features or 'longTitle' not in features[name] else features[name]['longTitle'][0]
        filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
        visibility = ('on' if i < 10 else 'off') if name not in features or 'visibility' not in features[name] else features[name]['visibility'][0]
        priority = 1 if name not in features or 'priority' not in features[name] else float(features[name]['priority'][0])
        yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
            (sql_fix(name), sql_fix(shortLabel), sql_fix(longLabel), sql_fix(name),
             clinical_table, filter, visibility, priority)
        yield "SET @col%d=LAST_INSERT_ID();\n" % i
        i += 1

    # codes
    i = 0
    values = {}
    for col in self.col_order:
        if (self.enum_map.has_key(col)):
            values[col] = {}
            j = 0
            for a in sorted(self.enum_map[col].keys(),
                            lambda x, y: self.enum_map[col][x] - self.enum_map[col][y]):
                yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (
                    i, j, sql_fix(a), i, j)
                values[col][a] = "@val%d_%d" % (i, j)
                j += 1
        i += 1

    yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table
    for col in self.col_order:
        if col == 'sampleName':
            yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
        else:
            if self.enum_map.has_key(col):
                yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
            else:
                yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
    yield """
) engine 'MyISAM';
"""
    for target in sortedSamples(self.row_hash.keys()):
        a = []
        for col, orig in zip(self.col_order, self.orig_order):
            if col == 'sampleName':
                val = target
            else:
                val = self.row_hash[target][self.col_list[orig]]
            if val is None or val.upper() in NULL_VALUES:
                a.append("\\N")
            else:
                if col in self.enum_map:
                    a.append(values[col][val])
                else:
                    a.append(val)
        yield u"INSERT INTO %s VALUES ( %d, %s );\n" % (
            clinical_table, id_table.get(table_name + ':sample_id', target),
            u",".join(a))
class ProbeMap(CGData.CGDataSetObject, CGData.CGGroupMember):
    child_type = Probe

    DATA_FORM = CGData.TABLE
    COLS = [
        CGData.Column('name', str, primary_key=True),
        CGData.Column('chrom', str),
        CGData.Column('chrom_start', str),
        CGData.Column('chrom_end', int),
        CGData.Column('strand', str)
    ]

    def __init__(self):
        CGData.CGDataSetObject.__init__(self)
        self.gene_map = None
        self.chrom_map = None

    def read(self, handle):
        self.gene_map = {}
        self.chrom_map = {}
        read = csv.reader(handle, delimiter="\t")
        for line in read:
            self.gene_map[line[0]] = line[1].split(',')
            try:
                self.append(
                    Probe(line[0], line[2], int(line[3]), int(line[4]), line[5],
                          self.gene_map[line[0]]))
            except ValueError:
                """location int conversion failed, ignore silently"""
                pass

    def append(self, probe):
        for attr in self.child_type.core_attr:
            if not hasattr(probe, attr):
                raise CGData.FormatException("Missing %s" % (attr))
        if self.chrom_map is None:
            self.chrom_map = {}
        if not probe.chrom in self.chrom_map:
            self.chrom_map[probe.chrom] = {}
        if not probe.name in self.chrom_map[probe.chrom]:
            self.chrom_map[probe.chrom][probe.name] = [probe]
        else:
            self.chrom_map[probe.chrom][probe.name].append(probe)

    def write(self, handle):
        for chrom in self.chrom_map:
            for probeName in self.chrom_map[chrom]:
                probes = self.chrom_map[chrom][probeName]
                for probe in probes:
                    handle.write("%s\n" % ("\t".join([
                        probe.name,
                        ",".join(probe.aliases),
                        probe.chrom,
                        str(probe.chrom_start),
                        str(probe.chrom_end),
                        probe.strand
                    ])))

    # XXX need a better name. What does this return?
    def lookup(self, item):
        if self.gene_map is None:
            self.load()
        for chrome in self.chrom_map:
            if item in self.chrom_map[chrome]:
                return self.chrom_map[chrome][item]
        return None

    def row_iter(self):
        if self.gene_map is None:
            self.load()
        for chrome in self.chrom_map:
            for probe in self.chrom_map[chrome]:
                pset = self.chrom_map[chrome][probe]
                for p in pset:
                    yield (p.name, p.chrom, p.chrom_start, p.chrom_end, p.strand)

    # XXX I have no idea what this is returning. What is a pset?
    def get_probes(self):
        if self.gene_map is None:
            self.load()
        for chrome in self.chrom_map:
            for probeSet in self.chrom_map[chrome]:
                for probe in self.chrom_map[chrome][probeSet]:
                    yield probe
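# Usage sketch (an assumption, not part of the original source): read a
# probeMap file and print the placement(s) of one probe. lookup() returns a
# list of Probe objects or None. "probes.tab" and "probe1" are hypothetical.
pm = ProbeMap()
handle = open("probes.tab")
pm.read(handle)
handle.close()
for probe in pm.lookup("probe1") or []:
    print probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand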