def run(self):
    """Execute the sub-command selected on the command line.

    ``run``: parse the experiment CSV named by ``self.args.infile``, assemble
    the project description, register it as a new "Runable" Milhouse project
    and print the ID of the registered project.

    ``status``: fetch the project with ID ``self.args.id`` and print its
    title, owner, overall status and the percentages found in the dictionary
    returned by ``getStatusDict()``.

    Any other value of ``self.args.subName`` is a no-op.
    """
    # NOTE: every print below is written as ``print(<single expression>)`` so
    # that it behaves identically as a Python 2 print statement and as a
    # Python 3 function call.
    subName = self.args.subName

    if subName == "run":
        condDict = MU.parseExperimentCSV(self.args.infile)

        # Ask the OS for the current user name. communicate() waits for the
        # child and closes the pipe, so neither a zombie process nor an open
        # file descriptor is left behind; passing an argument list avoids
        # spawning an intermediate shell.
        whoami = subprocess.Popen(["whoami"], stdout=subprocess.PIPE)
        runBy = whoami.communicate()[0].strip()

        dataDict = {
            "Analysis": STANDARD if self.args.atype == "Standard" else MINIMAL,
            # Conditions are stored sorted by condition name.
            "Conditions": OrderedDict((name, condDict[name]) for name in sorted(condDict)),
            "Description": self.args.desc,
            "Hypothesis": self.args.hyp,
            "RunBy": runBy,
            "Tags": ["PrimaryRetraining"],
            "Title": self.args.title,
        }

        mProjectR = MProjectFactory.getMProject("Runable", self.mDB)
        mProjectR.setFromDict(dataDict)
        mProjectR.register()
        print("Registered Milhouse Project %s" % mProjectR.ID)

    elif subName == "status":
        mProject = self.mDB.getMProjectFromID(self.args.id)
        sDict = mProject.getStatusDict()
        print("Milhouse Project %d: %s [by: %s]" % (self.args.id, mProject.title, mProject.runby))
        print("Status: %s" % sDict["Status"])
        print("Martin: %.1f%%" % sDict["Conditions"])
        print("Analysis-Plots: %.1f%%" % sDict["Analysis"]["Plots"])
        print("Analysis-Tables: %.1f%%" % sDict["Analysis"]["Tables"])
def _validateCSV(self, csvFN):
    """Validate an experiment-definition CSV file.

    The file is loaded with ``MU.getRecArrayFromCSV`` and then run through a
    fixed sequence of checks (MartinType values, unpopulated columns, required
    column names, condition-name characters, ``p_`` prefix on extra columns,
    workflow names, reference sequences, MartinJobID lengths, run-code /
    primary-folder consistency, per-condition uniqueness). The first check
    that fails is logged through ``logging.error`` and terminates the process.

    Args:
        csvFN: Path of the CSV file to validate. When empty or ``None`` the
            path in ``self.args.infile`` is used instead.

    Raises:
        SystemExit: with status 1 as soon as a check fails, or when a
            ``ValueError`` is raised while loading or inspecting the file.
    """

    def fail(msg):
        # Log the problem and abort with a non-zero status so that calling
        # shells/scripts can tell a rejected CSV from a successful run.
        logging.error(msg)
        sys.exit(1)

    try:
        data = MU.getRecArrayFromCSV(csvFN or self.args.infile, caseSensitive=True)
        requiredCols = MU.getCSVF("MartinJobID" in data.dtype.names, withExtras=True)

        # Check for correct usage of MartinType
        if "MartinType" in data.dtype.names:
            if not set(n.unique(data["MartinType"])).issubset(MU.MARTIN_ROOT.keys()):
                fail("Invalid MartinType value, allowed values are = [%s]" % ", ".join(MU.MARTIN_ROOT.keys()))
            elif "smrtportal" in data["MartinType"] and "MartinJobID" not in data.dtype.names:
                fail(
                    "Smrportal conditions can not be run at this time and thus require a populated MartinJobID column"
                )
        else:
            # No MartinType column supplied: append one, filled for every row
            # with the first key of MU.MARTIN_ROOT.
            defaultType = list(MU.MARTIN_ROOT.keys())[0]
            data = MU.addColumnToRecArray(data, [defaultType] * len(data), ("MartinType", "|S11"), tail=True)

        # Check for unpopulated default columns (known columns that were
        # loaded with a boolean dtype).
        emptyCols = [
            descr[0]
            for descr in data.dtype.descr
            if n.dtype(descr[1]) == n.dtype(bool) and descr[0] in MU.CSVF_ALL
        ]
        if emptyCols:
            fail(
                "Incorrectly formatted CSV file:\n Column(s) [%s] have not been populated" % ", ".join(emptyCols)
            )

        # Check if the file contains the correct default column names
        if any(col not in data.dtype.names for col in requiredCols):
            # Wrap in a 1-tuple so a tuple-valued column list is formatted as
            # a whole instead of being unpacked by the % operator.
            fail("Incorrectly formatted CSV file:\n Missing default column names from %s" % (requiredCols,))

        # Check for correct naming of conditions
        illegalChar = re.compile(r"[^A-Za-z0-9_\.\-]")
        if any(illegalChar.search(name) for name in data["Name"]):
            fail(
                "Incorrectly formatted CSV file:\n Condition names can only contain: alphanumeric characters, dashes (-), underscores (_) and dots (.)"
            )

        # Check if the non-default columns have a p_ prefix
        extras = [col for col in data.dtype.names if col not in MU.CSVF_ALL]
        if any(not col.startswith("p_") for col in extras):
            fail('Incorrectly formatted CSV file:\n Extra parameters need to be named using a "p_" prefix')

        # Check if workflow provided exists in martin's list of workflows
        if "smrtportal" not in data["MartinType"]:
            mWkflowNames = self.mDBS.getMartinWkflowDict().keys()
            if any(wkflow not in mWkflowNames for wkflow in n.unique(data["MartinWorkflow"])):
                fail("Unsupported Martin Workflow name provided.")

        # Check if reference sequence provided exists in the reference repository
        if "MartinRefSeq" in data.dtype.names:
            # Collect the offending *names* (not the whole rows) so they can
            # be joined into the error message.
            wrongRefSeqs = sorted(
                set(
                    str(row["MartinRefSeq"])
                    for row in data
                    if not glob.glob("%s/%s" % (MU.MARTIN_REFREPOS[row["MartinType"]], row["MartinRefSeq"]))
                )
            )
            if wrongRefSeqs:
                fail("The following reference sequence names are invalid: [%s]." % ",".join(wrongRefSeqs))

        # Check for correctness MartinJobID values
        if "MartinJobID" in data.dtype.names:
            # Lengths (in characters) of all job IDs that are not 6 long.
            oddLens = set(len(str(jobID)) for jobID in data["MartinJobID"]) - set([6])
            if oddLens == set([5]):
                # Only 5-character IDs besides the regular ones: accepted
                # solely when some row is declared as smrtportal.
                if "smrtportal" not in data["MartinType"]:
                    fail(
                        "Invalid MartinJobID lengths supplied:\n If these are smrtportal jobs, you need to set MartinType to smrtportal"
                    )
            elif oddLens:
                fail("Invalid MartinJobID lengths supplied:\n Martin expects length == 6 and smrtportal length => 5")

        # Check whether primary folder names are contained within the given run codes
        if set(["RunCodes", "PrimaryFolder"]).issubset(data.dtype.names):
            for row in data:
                codeParts = row["RunCodes"].split("-")
                if len(codeParts) != 2:
                    # Only "<exp>-<run>" style codes are checked on disk.
                    continue
                exp, run = codeParts
                if not glob.glob("/mnt/data*/vol*/%s/%s/%s" % (exp, run, row["PrimaryFolder"])):
                    fail(
                        "Run code [%s] does not contain primary folder [%s]."
                        % (row["RunCodes"], row["PrimaryFolder"])
                    )

        # Check for uniqueness of column values within conditions
        for cond in n.unique(data["Name"]):
            condRows = data[data["Name"] == cond]
            # Every column except RunCodes must hold a single value per condition.
            if any(len(n.unique(condRows[col])) != 1 for col in condRows.dtype.names if col != "RunCodes"):
                fail("For condition name=%s some of the attributes are NOT unique" % cond)
    except ValueError as err:
        fail("Incorrectly formatted CSV file:\n %s" % err)
MINIMAL = ["tSummary", "readlength", "accuracy", "yield"] # Set MILHOUSE_HOME environment variable os.environ["MILHOUSE_HOME"] = os.path.join("/mnt/secondary/Share/Milhouse", server) # Get configuration info and check for MILHOUSE_HOME setting if not os.environ.get("MILHOUSE_HOME"): print "Environment variable MILHOUSE_HOME is not set! Exiting..." sys.exit(1) else: CONFDICT = os.path.join(os.environ.get("MILHOUSE_HOME"), "config", "milhouse.conf") if not os.path.isfile(CONFDICT): print "milhouse.conf stored at [%s] does not exist! Exiting..." % CONFDICT sys.exit(1) else: CONFDICT = MU.parseMilhouseConf(CONFDICT) print "Submitting Milhouse analysis project to %s server" % CONFDICT["MDB_TYPE"] # Tool for running Milhouse Jobs from command line class ToolRunner(object): def __init__(self): self.mDB = MDBCFactory.getMDBController( "data", mdbServer=CONFDICT["MDB_SERVER"], mdbPort=CONFDICT["MDB_PORT"], mlDataDir=CONFDICT["ML_DATADIR"] ) self.mDBS = self.mDB.getMDBExtra()[0] self._parseOptions() self._setupLogging() def _parseOptions(self): desc = [ "Tool for running and managing Milhouse projects from the command line.",