def __pslCheck(self): """Runs pslCheck program on this table and sends result to reporter's filehandle.""" self.reporter.beginStep(self.db, self.table, "pslCheck") # Get baseColorUseSequence setting tdbCommand = ["tdbQuery", "select baseColorUseSequence from " + self.db +\ " where track='" + self.table +"' or table='"+ self.table +"'"] tdbOut, tdbErr = qaUtils.runCommand(tdbCommand) # Turn tdbOut into array so we can extract different pieces tdbOut = tdbOut.replace("\n", "") tdbOut = tdbOut.split(" ") # tdbOut[1] should be what option is used for baseColorUseSequence # We want to fully run pslCheck for those entries with "table" and "extFile" if len(tdbOut) > 2: if tdbOut[1] == "table": # Get sizes of sequences in specified table sizesOut = qaUtils.callHgsql( self.db, "select name, length(seq) from " + tdbOut[2]) elif tdbOut[1] == "extFile": # Extract sizes from named seq table sizesOut = qaUtils.callHgsql( self.db, "select acc, size from " + tdbOut[2]) # Write itemSizes to file itemSizes = open("%s.sizes" % self.table, 'w') itemSizes.write(sizesOut) itemSizes.close() # Check if $db.sizes file exists if not os.path.isfile("%s.chrom.sizes" % self.db): # If it doens't exist, get chrom sizes from chromInfo chromSizesOut = qaUtils.callHgsql(self.db, "select * from chromInfo") chromSizes = open("%s.chrom.sizes" % self.db, 'w') chromSizes.write(chromSizesOut) chromSizes.close() # Run more in-depth version of pslCheck command = ("pslCheck", "-querySizes=%s.sizes" % self.table ,\ "-targetSizes=%s.chrom.sizes" % self.db, "-db=" + self.db, self.table) self.reporter.writeCommand(command) commandOut, commandErr, commandReturnCode = qaUtils.runCommandNoAbort( command) # Write output to file self.reporter.fh.write(commandErr) # Clean up intermediate item sizes file os.remove("%s.sizes" % self.table) # For everything else, use generic set of steps else: command = ["pslCheck", "db=" + self.db, self.table] self.reporter.writeCommand(command) commandOut, commandErr, commandReturnCode = qaUtils.runCommandNoAbort( command) self.reporter.fh.write(commandErr) if commandReturnCode: self.recordError() else: self.recordPass() self.reporter.endStep()
def checkGapOverlap(db, table, checkUnbridged=False): """Check if any items in a table over lap with bridged and unbridged gaps""" tableBedTemp = makeBedFromTable(db, table) gapOverlapOut = "" gapTableList = checkSplit(db, "gap") gapFileName = str(db) + ".gap.bed.temp" gapFile = open(gapFileName, 'w') for tbl in gapTableList: hgsqlGapOut = qaUtils.callHgsql( db, "select chrom, chromStart, chromEnd from " + tbl) gapFile.write(hgsqlGapOut) gapFile.close() gapOverFile = str(db) + ".gapOver.bed" bedIntCmd = [ "bedIntersect", "-aHitAny", gapFileName, tableBedTemp.name, gapOverFile ] qaUtils.runCommand(bedIntCmd) gapOverUrls = constructOutputUrls(db, table, gapOverFile) gapOverlapOut += gapOverUrls # Remove intermediate files os.remove(gapOverFile) os.remove(gapFileName) # If checkUnbridged is set, then also output overlap with unbridged gaps if checkUnbridged == True: # Create temp file to store unbridged gaps gapUnbrFileName = str(db) + ".gap.unbr.bed.temp" gapUnbrFile = open(gapUnbrFileName, 'w') # Get unbridged gaps for tbl in gapTableList: hgsqlGapOut = qaUtils.callHgsql( db, "select chrom, chromStart, chromEnd from " + tbl + " where bridge='no'") gapUnbrFile.write(hgsqlGapOut) gapUnbrFile.close() gapUnbrOverFile = str(db) + ".gapOver.unbr.bed" bedIntCmd = [ "bedIntersect", "-aHitAny", gapUnbrFileName, tableBedTemp.name, gapUnbrOverFile ] qaUtils.runCommand(bedIntCmd) gapUnbrOverUrls = constructOutputUrls(db, table, gapUnbrOverFile) gapOverlapOut += gapUnbrOverUrls # Remove intermediate files os.remove(gapUnbrOverFile) os.remove(gapUnbrFileName) return gapOverlapOut
def __pslCheck(self): """Runs pslCheck program on this table and sends result to reporter's filehandle.""" self.reporter.beginStep(self.db, self.table, "pslCheck") # Get baseColorUseSequence setting tdbCommand = ["tdbQuery", "select baseColorUseSequence from " + self.db +\ " where track='" + self.table +"' or table='"+ self.table +"'"] tdbOut, tdbErr = qaUtils.runCommand(tdbCommand) # Turn tdbOut into array so we can extract different pieces tdbOut = tdbOut.replace("\n","") tdbOut = tdbOut.split(" ") # tdbOut[1] should be what option is used for baseColorUseSequence # We want to fully run pslCheck for those entries with "table" and "extFile" if len(tdbOut) > 2: if tdbOut[1] == "table": # Get sizes of sequences in specified table sizesOut = qaUtils.callHgsql(self.db, "select name, length(seq) from " + tdbOut[2]) elif tdbOut[1] == "extFile": # Extract sizes from named seq table sizesOut = qaUtils.callHgsql(self.db, "select acc, size from " + tdbOut[2]) # Write itemSizes to tempfile itemSizesTemp = tempfile.NamedTemporaryFile(mode='w') itemSizes = open(itemSizesTemp.name, 'w') itemSizes.write(sizesOut) itemSizes.close() # Write chromSizes to tempfile chromSizesTemp = tempfile.NamedTemporaryFile(mode='w') chromSizesOut = qaUtils.callHgsql(self.db, "select chrom,size from chromInfo") chromSizes = open(chromSizesTemp.name, 'w') chromSizes.write(chromSizesOut) chromSizes.close() # Run more in-depth version of pslCheck command = ("pslCheck", "-querySizes=" + itemSizesTemp.name ,\ "-targetSizes=" + chromSizesTemp.name, "-db=" + self.db, self.table) self.reporter.writeCommand(command) commandOut, commandErr, commandReturnCode = qaUtils.runCommandNoAbort(command) # Write output to file self.reporter.fh.write(commandErr) # For everything else, use generic set of steps else: command = ["pslCheck", "db=" + self.db, self.table] self.reporter.writeCommand(command) commandOut, commandErr, commandReturnCode = qaUtils.runCommandNoAbort(command) self.reporter.fh.write(commandErr) if commandReturnCode: self.recordError() else: self.recordPass() self.reporter.endStep()
def checkSplit(db, table): """Check if table is split by chromosome""" splitCheck = qaUtils.callHgsql(db, "show tables like '" + table + "'") splitCheck = splitCheck.strip() splitTableList = [] if splitCheck == table: splitTableList.append(table) elif splitCheck == "": hgsqlTblsOut = qaUtils.callHgsql(db, "show tables like 'chr%" + table + "%'") splitTableList = hgsqlTblsOut.split('\n') splitTableList.remove("") return splitTableList
def checkSplit(db, table): """Check if table is split by chromosome""" splitCheck = qaUtils.callHgsql(db, "show tables like '" + table + "'") splitCheck = splitCheck.strip() splitTableList = [] if splitCheck == table: splitTableList.append(table) elif splitCheck == "": hgsqlTblsOut = qaUtils.callHgsql( db, "show tables like 'chr%" + table + "%'") splitTableList = hgsqlTblsOut.split('\n') splitTableList.remove("") return splitTableList
def checkTableIndex(database, tables): """ Check if each table has an index or not.""" notgbdbtablelist = tables - getGbdbTables(database, tables) tablelist = list() missing = set() output = [] if not notgbdbtablelist: output.append("No tables require an index") output.append("") return (output, missing) for i in notgbdbtablelist: hgsqlOut = qaUtils.callHgsql(database, "show indexes from " + i) index = hgsqlOut.split() if index: pass else: missing.add(i) if missing: output.append("Tables missing an index:") for i in missing: output.append(i) output.append("") else: output.append("No missing indices") output.append("") return (output, missing)
def __getChromCountsFromDatabase(self): """Returns a list of ['chrom', 'size (bases)', 'count', 'count/megabase'] lists for table""" chromCol = self.__getChromCol() # first half of the UNION below gets rows that do not appear in table at all query = ( "(SELECT chrom, size, 0 as count FROM chromInfo WHERE chrom NOT IN" + " (select distinct " + chromCol + " from " + self.table + "))" + " UNION" + " (SELECT inf.chrom, inf.size, count(tab." + chromCol + ") as count" + " FROM chromInfo as inf, " + self.table + " as tab" + " WHERE inf.chrom=tab." + chromCol + " GROUP BY chrom)" ) hgsqlOut = qaUtils.callHgsql(self.db, query) rows = hgsqlOut.split("\n") # makes list of strings with tabs embedded rows.remove("") # get rid of the empty list item created by the final newline rowsList = [] for row in rows: rowsList.append(row.split("\t")) # result is list of ['chrom', 'size', 'count'] for row in rowsList: # calculate count/size (in megabases) as floats, put in rowsList[3] row.append(float(row[2]) * 1000000 / float(row[1])) return rowsList
def checkCanPack(db, table): """Check if a track can be set to pack""" visibility = "pack" canPackOut = qaUtils.callHgsql(db, "select canPack from trackDb where tableName='" + table + "'") if canPackOut != 1: visibility = "full" return visibility
def getGbdbTables(database, tableset): """ Remove tables that aren't pointers to Gbdb files.""" sep = "','" tablestr = sep.join(tableset) tablestr = "'" + tablestr + "'" hgsqlOut = qaUtils.callHgsql(database, "select table_name from information_schema.columns where\ table_name in ("+ tablestr + ") and column_name = 'fileName'") gbdbtableset = set(hgsqlOut.split()) return gbdbtableset
def checkCanPack(db, table): """Check if a table can be set to pack""" visibility = "pack" canPackOut = qaUtils.callHgsql( db, "select canPack from trackDb where tableName='" + table + "'") if canPackOut != 1: visibility = "full" return visibility
def __checkTableDescription(self): """Checks for an autoSql definition for this table in the tableDescriptions table.""" # Ideally this would also check to see if each column in the table is described. self.reporter.beginStep(self.db, self.table, "checking for table descriptions") self.reporter.writeStepInfo() sqlOut = qaUtils.callHgsql(self.db, "select autoSqlDef from tableDescriptions where\ tableName='" + self.table + "'") if sqlOut.strip() == '': self.reporter.writeLine("No table description for " + self.db + "." + self.table) self.recordError() else: self.recordPass() self.reporter.endStep()
def __checkTableDescription(self): """Checks for an autoSql definition for this table in the tableDescriptions table.""" # Ideally this would also check to see if each column in the table is described. self.reporter.beginStep(self.db, self.table, "checking for table descriptions") self.reporter.writeStepInfo() error = False sqlOut = qaUtils.callHgsql( self.db, "select autoSqlDef from tableDescriptions where\ tableName='" + self.table + "'") if sqlOut.strip() == '': self.reporter.writeLine("ERROR: No table description for " + self.db + "." + self.table) error = True self.recordPassOrError(error) self.reporter.endStep()
def makeBedFromTable(db, table): """Make BED3 tempfile out of a table""" # Get chrom, start, and end column names trackType = tableTypeUtils.getTrackType(db, table) columnNames = tableTypeUtils.getChrStartEndCols(trackType) tableList = checkSplit(db, table) tableBedTemp = tempfile.NamedTemporaryFile(mode='w') tableBed = open(tableBedTemp.name, 'w') for tbl in tableList: hgsqlCmd = "select " + columnNames[0] + ", " + columnNames[1] + ", " \ + columnNames[2] + " from " + tbl hgsqlOut = qaUtils.callHgsql(db, hgsqlCmd) tableBed.write(hgsqlOut) tableBed.close() return tableBedTemp
def makeBedFromTable(db, table, whereOpt=""): """Make BED3 out of a table""" # Get chrom, start, and end column names trackType = getTrackType(db, table) columnNames = getChrStartEndCols(trackType) tableList = checkSplit(db, table) # whereOpt allows you to select only certain items from table, e.g. only unbridged gaps if whereOpt: whereOpt = " where " + whereOpt output = "" for tbl in tableList: hgsqlCmd = "select " + columnNames[0] + ", " + columnNames[1] + ", " \ + columnNames[2] + " from " + tbl + whereOpt hgsqlOut = qaUtils.callHgsql(db, hgsqlCmd) output += hgsqlOut return output
def __getChromCountsFromDatabase(self): """Returns a list of ['chrom', 'size (bases)', 'count', 'count/megabase'] lists for table""" chromCol = trackUtils.getChromCol(self.tableType) # first half of the UNION below gets rows that do not appear in table at all query = "(SELECT chrom, size, 0 as count FROM chromInfo WHERE chrom NOT IN" +\ " (select distinct " + chromCol + " from " + self.table + "))" +\ " UNION" +\ " (SELECT inf.chrom, inf.size, count(tab." + chromCol + ") as count" +\ " FROM chromInfo as inf, " + self.table + " as tab" +\ " WHERE inf.chrom=tab." + chromCol + " GROUP BY chrom)" hgsqlOut = qaUtils.callHgsql(self.db, query) rows = hgsqlOut.split("\n") # makes list of strings with tabs embedded rows.remove('') # get rid of the empty list item created by the final newline rowsList = [] for row in rows: rowsList.append(row.split("\t")) # result is list of ['chrom', 'size', 'count'] for row in rowsList: # calculate count/size (in megabases) as floats, put in rowsList[3] row.append(float(row[2])*1000000/float(row[1])) return rowsList
def checkTableDescriptions(database, tables): """ Check if each table has a description or not.""" tablelist = list() missing = set() output = [] orstr = "" for i in tables: tablelist.append("tableName = '%s'" % i) orsep = " OR " orstr = orsep.join(tablelist) hgsqlOut = qaUtils.callHgsql(database, "select tableName from tableDescriptions where " + orstr) described = set(hgsqlOut.split()) missing = tables - described if missing: output.append("Tables missing a description:") for i in missing: output.append(i) output.append("") else: output.append("No tables missing a description") output.append("") return (output, missing)
def countPerChrom(database, tables): """ Count the amount of rows per chromosome.""" notgbdbtablelist = tables - getGbdbTables(database, tables) tablecounts = dict() output = [] globalseen = set() localseen = dict() hgsqlOut = qaUtils.callHgsql(database, "select chrom from chromInfo") chrlist = set(hgsqlOut.split()) notPositionalTable = set() if not notgbdbtablelist: output.append("No tables to count chroms") output.append("") return (output, tablecounts) for i in notgbdbtablelist: counts = dict() cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() chrs = cmdoutput.split("\n")[1:-1] localseen[i] = set() if not chrs: notPositionalTable.add(i) continue for j in chrs: globalseen.add(j) if counts.has_key(j): counts[j] = counts[j] + 1 else: localseen[i].add(j) counts[j] = 1 tablecounts[i] = counts for i in sorted(tablecounts): output.append(i) used = set() for j in sorted_nicely(tablecounts[i]): output.append("%s = %s" % (j, tablecounts[i][j])) notused = chrlist - (localseen[i] | (chrlist - globalseen)) if notused: output.append("Seen by others, but not used here:") for j in sorted_nicely(notused): output.append(j) output.append("") globalnotused = chrlist - globalseen if globalnotused: output.append("Not seen anywhere:") for i in sorted_nicely(globalnotused): output.append(i) output.append("") if notPositionalTable: output.append("Not a positional table:") for i in notPositionalTable: output.append(i) return (output, tablecounts)
def __rowCount(self): """Adds the number of rows in this table to the sumRow.""" rowCount = qaUtils.callHgsql(self.db, "select count(*) from " + self.table) self.sumRow.setCount(rowCount.strip())