import json
import os
from fnmatch import fnmatch
from threading import Semaphore

# NOTE: the helpers Parallel, das_query, shell_expand and ask_user come from
# the surrounding flashgg utilities, and readCatalog/writeCatalog/getAllDatasets
# are defined elsewhere in this module; none of them are shown in this excerpt.

class SamplesManager(object):

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where sample cross sections are stored
        @dbs_instance: DBS instance to use
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance
        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue

    def importFromDAS(self,datasets):
        """
        Import datasets from DAS to the catalog.
        @datasets: wildcard to be used in the dataset query
        """
        catalog = self.readCatalog()

        print "Importing from DAS %s" % datasets
        if "*" in datasets:
            response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % datasets, 0, 0, False, self.dbs_instance_)

            datasets = []
            for d in response["data"]:
                datasets.append( d["dataset"][0]["name"] )

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            if dsetName in catalog:
                catalog[ dsetName ]["files"] = files
            else:
                catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders to be imported
        """
        catalog = self.readCatalog()
        for folder in folders:
            dsetName = ""
            while not len(dsetName.split("/")) == 4:
                print "enter dataset name for folder %s" % folder,
                dsetName = raw_input()

            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)
            if dsetName in catalog:
                catalog[ dsetName ]["files"] = files
            else:
                catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_)

        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]
        print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog.
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():
            self.checkDatasetFiles(dataset,catalog)

        outcomes = self.parallel_.wait()
        for dsetName,ifile,fName,ret,out in outcomes:
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    extraInfo = json.loads(str(out))
                    for key,val in extraInfo.iteritems():
                        info[key] = val

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def checkDatasetFiles(self,dsetName,catalog=None):
        """
        Look for corrupted files in dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_)
            wait = True

        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]
        print len(files)

        for ifile,finfo in enumerate(files):
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile])

        if wait:
            self.parallel_.wait()
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """
        Interactively review the catalog, pruning unwanted datasets.
        """
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
            primaries[ primary ].append(d)

        for name,val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all?" % (name,"\n ".join(val)))
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/","_"),ifile)
        ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,"2>/dev/null"],interactive=True)[2]

        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            print ret, out
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName,ifile,fileName,ret,out

class SamplesManager(object):

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None, maxThreads=200, force=False, doContinue=False):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where sample cross sections are stored
        @dbs_instance: DBS instance to use
        @queue: optional batch queue used to run check jobs through Parallel
        @maxThreads: maximum number of threads used by Parallel
        @force: redo duplicate vetting and file checks, and overwrite instead of merging on import
        @doContinue: pick up the output of previously run check jobs instead of resubmitting
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance
        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue
        self.maxThreads_ = maxThreads
        self.force_ = force
        self.continue_ = doContinue

    def importFromDAS(self,list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: list of dataset names; wildcards are expanded through a DAS query
        """
        catalog = self.readCatalog()

        print "Importing from DAS %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_)
                for d in response["data"]:
                    datasets.append( d["dataset"][0]["name"] )
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            self.addToDataset(catalog,dsetName,files)

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders to be imported
        """
        catalog = self.readCatalog()
        auto = False
        assumeOk = False
        for folder in folders:
            dsetName = ""
            print
            print "importing folder\n %s" % folder
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim,sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim,sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp = ask_user("ok?",["y","n","a"])
                        if resp == "n":
                            dsetName = ""
                            auto = False
                        elif resp == "a":
                            assumeOk = True
                if not auto:
                    print "enter dataset name (auto/noauto to enable/disable automatic guessing) ",
                    dsetName = raw_input()
                    if dsetName == "auto":
                        auto = True
                    elif dsetName == "noauto":
                        auto = False

            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)
            self.addToDataset(catalog,dsetName,files)

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True)

        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]

        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self,match=None,light=False):
        """
        Look for corrupted files in the whole catalog.
        @match: only check datasets matching this wildcard
        @light: only run the duplicates removal, skipping the per-file check jobs
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True,lsfJobName=".fgg/job")

        print "Checking all datasets"
        self.outcomes = []
        for dataset in catalog.keys():
            if match and not fnmatch(dataset,match):
                continue
            self.checkDatasetFiles(dataset,catalog,light=light)
        # write the catalog immediately, to avoid redoing the duplicates removal
        self.writeCatalog(catalog)

        if self.queue_:
            self.parallel_.wait(printOutput=True,handler=self)
            outcomes = self.outcomes
        else:
            outcomes = self.parallel_.wait(printOutput=False)

        nfailed = 0
        for ign1,ign2,outcome in outcomes:
            dsetName,ifile,fName,ret,out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                    nfailed += 1
                else:
                    info["bad"] = False
                    extraInfo = json.loads(str(out))
                    if len(extraInfo.keys()) == 0:
                        nfailed += 1
                        info["bad"] = True
                    for key,val in extraInfo.iteritems():
                        info[key] = val

        self.parallel_.stop()

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

        if nfailed > 0:
            print
            print "WARNING: some of the check jobs failed or did not return any output."
            print "         Those (%d) files were marked as bad and won't be usable for analysis." % nfailed
            print "         Re-running the check command may recover the temporary failures."
            print

        if self.queue_:
            print
            print "Note: log files may have been written in ./.fgg"
            print "      it's up to you to clean up, though..."

    def checkDatasetFiles(self,dsetName,catalog=None,light=False):
        """
        Look for corrupted and duplicated files in a dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True)
            wait = True

        print
        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)

        if self.force_ or not catalog[dsetName].get("vetted",False):
            toremove = []
            keep_wildcard = None
            for ifil,eifil in enumerate(files):
                if ifil in toremove:
                    continue
                # enumerate from ifil+1 so that jfil is ejfil's absolute index in files
                for jfil,ejfil in enumerate(files[ifil+1:],ifil+1):
                    if jfil in toremove:
                        continue
                    if eifil["name"] == ejfil["name"]:
                        toremove.append(ifil)
                    else:
                        iid = eifil["name"].rstrip(".root").rsplit("_",1)[-1]
                        jid = ejfil["name"].rstrip(".root").rsplit("_",1)[-1]
                        if iid == jid:
                            if not keep_wildcard:
                                print "duplicated file index ", iid
                                print eifil["name"]
                                print ejfil["name"]
                                reply = ask_user("keep both (yes/no/matching)? ",["y","n","m"])
                                if reply == "m":
                                    while not keep_wildcard:
                                        print "enter wildcard matching expression",
                                        keep_wildcard = raw_input()
                                        if ask_user("keep all files matching '%s'?" % keep_wildcard) == "n":
                                            keep_wildcard = None
                            if keep_wildcard:
                                imatch = fnmatch(eifil["name"],keep_wildcard)
                                jmatch = fnmatch(ejfil["name"],keep_wildcard)
                                if imatch != jmatch:
                                    if imatch:
                                        toremove.append(jfil)
                                    else:
                                        toremove.append(ifil)
                                    continue
                            else:
                                print "duplicated file index ", iid
                                print eifil["name"]
                                print ejfil["name"]
                                reply = ask_user("keep both? ")
                                if reply == "n":
                                    if ask_user( "keep %s? " % ejfil["name"] ) == "n":
                                        toremove.append(jfil)
                                    if ask_user( "keep %s? " % eifil["name"] ) == "n":
                                        toremove.append(ifil)

            for ifile in sorted(toremove,reverse=True):
                files.pop(ifile)

        print "After duplicates removal: ", len(files)
        nsub = 0
        catalog[dsetName]["vetted"] = True
        if not light:
            info = catalog[dsetName]["files"] = files
            for ifile,finfo in enumerate(files):
                name = finfo["name"]
                if self.force_ or not "weights" in finfo:
                    nsub += 1
                    self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile],interactive=(self.queue_!=None))
        if nsub == 0:
            print "No files needed to be checked"
        else:
            print "Submitted %d check jobs" % nsub

        if wait:
            self.parallel_.wait(printOutput=False)
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """
        Interactively review the catalog, pruning and merging datasets.
        """
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
            primaries[ primary ].append(d)

        for name,val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all (yes/no/merge)?" % (name,"\n ".join(val)),["y","n","m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst],catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def mergeDataset(self,dst,merge):
        """
        Merge the file list of @merge into @dst, skipping duplicates, and
        mark @dst as needing a new vetting pass.
        """
        dst["vetted"] = False
        dstFiles = dst["files"]
        mergeFiles = merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append( fil )

    def addToDataset(self,catalog,dsetName,files):
        """
        Add files to a dataset, creating it if needed; overwrite instead of
        merging when running with force=True.
        """
        if dsetName in catalog:
            if self.force_:
                catalog[ dsetName ]["files"] = files
            else:
                self.mergeDataset(catalog[ dsetName ],{ "files" : files })
        else:
            catalog[ dsetName ] = { "files" : files }

    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/","_"),ifile)

        if self.continue_:
            if os.path.exists(tmp):
                print "%s already exists" % tmp
                outcome = self.readJobOutput(tmp,0,"",dsetName,fileName,ifile)
                if self.queue_:
                    self.outcomes.append((None,None,outcome))
                else:
                    return outcome
            return None

        if self.queue_:
            self.parallel_.run("fggCheckFile.py",[fName,tmp,dsetName,str(ifile),"2>/dev/null"],interactive=False)
        else:
            ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,dsetName,str(ifile),"2>/dev/null"],interactive=True)[2]
            return self.readJobOutput(tmp,ret,out,dsetName,fileName,ifile)

    def readJobOutput(self,tmp,ret,out,dsetName,fileName,ifile):
        """
        Read the json summary written by a check job and remove the
        temporary file.
        """
        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
            os.remove(tmp)
        except Exception, e:
            print ret, out
            print e
            out = "{}"

        return dsetName,int(ifile),fileName,ret,out
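
# A minimal usage sketch for the class above; the catalog path and dataset
# patterns are hypothetical, and readCatalog/writeCatalog/getAllDatasets are
# assumed to be provided elsewhere in this module:
#
#   mgr = SamplesManager("$CMSSW_BASE/src/flashgg/MetaData/data/datasets.json",
#                        queue=None, maxThreads=100, force=False, doContinue=False)
#   mgr.importFromDAS(["/GluGluHToGG*/*/USER"])    # expand the wildcard via DAS and import
#   mgr.checkAllDatasets(match="/GluGluHToGG*")    # validate files; failures are marked "bad"
#
# The catalog itself is a plain JSON mapping from dataset name to file
# records; a sketch of its layout, inferred from the accesses above (the
# dataset and file names are hypothetical, and any extra per-file keys are
# whatever fggCheckFile.py reports):
#
#   {
#       "/GluGluHToGG_Example/someuser-RunII-v1/USER": {
#           "vetted": true,
#           "files": [
#               { "name": "/store/group/phys_higgs/example_1.root", "nevents": 1000 },
#               { "name": "/store/group/phys_higgs/example_2.root", "nevents": 0, "bad": true }
#           ]
#       }
#   }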

class SamplesManager(object):

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where sample cross sections are stored
        @dbs_instance: DBS instance to use
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance
        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue

    def importFromDAS(self,list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: list of dataset names; wildcards are expanded through a DAS query
        """
        catalog = self.readCatalog()

        print "Importing from DAS %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_)
                for d in response["data"]:
                    datasets.append( d["dataset"][0]["name"] )
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            if dsetName in catalog:
                catalog[ dsetName ]["files"] = files
            else:
                catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders to be imported
        """
        catalog = self.readCatalog()
        auto = False
        assumeOk = False
        for folder in folders:
            dsetName = ""
            print
            print "importing folder\n %s" % folder
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim,sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim,sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp = ask_user("ok?",["y","n","a"])
                        if resp == "n":
                            dsetName = ""
                            auto = False
                        elif resp == "a":
                            assumeOk = True
                if not auto:
                    print "enter dataset name (auto/noauto to enable/disable automatic guessing) ",
                    dsetName = raw_input()
                    if dsetName == "auto":
                        auto = True
                    elif dsetName == "noauto":
                        auto = False

            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)
            if dsetName in catalog:
                catalog[ dsetName ]["files"] = files
            else:
                catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_)

        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]

        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog.
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():
            self.checkDatasetFiles(dataset,catalog)

        outcomes = self.parallel_.wait(printOutput=False)
        for ign1,ign2,outcome in outcomes:
            dsetName,ifile,fName,ret,out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    extraInfo = json.loads(str(out))
                    for key,val in extraInfo.iteritems():
                        info[key] = val

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def checkDatasetFiles(self,dsetName,catalog=None):
        """
        Look for corrupted and duplicated files in a dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_)
            wait = True

        print
        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)

        toremove = []
        for ifil,eifil in enumerate(files):
            if ifil in toremove:
                continue
            # enumerate from ifil+1 so that jfil is ejfil's absolute index in files
            for jfil,ejfil in enumerate(files[ifil+1:],ifil+1):
                if jfil in toremove:
                    continue
                if eifil["name"] == ejfil["name"]:
                    toremove.append(ifil)
                else:
                    iid = eifil["name"].rstrip(".root").rsplit("_",1)[-1]
                    jid = ejfil["name"].rstrip(".root").rsplit("_",1)[-1]
                    if iid == jid:
                        print "duplicated file index ", iid
                        print eifil["name"]
                        print ejfil["name"]
                        reply = ask_user("keep both? ")
                        if reply == "n":
                            if ask_user( "keep %s? " % ejfil["name"] ) == "n":
                                toremove.append(jfil)
                            if ask_user( "keep %s? " % eifil["name"] ) == "n":
                                toremove.append(ifil)

        for ifile in sorted(toremove,reverse=True):
            files.pop(ifile)

        print "After duplicates removal: ", len(files)
        info = catalog[dsetName]["files"] = files
        for ifile,finfo in enumerate(files):
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile])

        if wait:
            self.parallel_.wait(printOutput=False)
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """
        Interactively review the catalog, pruning and merging datasets.
        """
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
            primaries[ primary ].append(d)

        for name,val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all (yes/no/merge)?" % (name,"\n ".join(val)),["y","n","m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst],catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def mergeDataset(self,dst,merge):
        """
        Merge the file list of @merge into @dst, skipping duplicates.
        """
        dstFiles = dst["files"]
        mergeFiles = merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append( fil )

    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/","_"),ifile)
        ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,"2>/dev/null"],interactive=True)[2]

        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            print ret, out
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName,ifile,fileName,ret,out
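
# Sketch of the duplicate detection used in checkDatasetFiles above: two job
# outputs count as duplicates when they share the trailing "_<index>" of the
# file name. The names below are hypothetical; note that rstrip(".root")
# strips a trailing character *set* rather than a suffix, which is safe here
# because job indices are numeric:
#
#   >>> "output_job_12.root".rstrip(".root").rsplit("_",1)[-1]
#   '12'
#   >>> "output_job_resub_12.root".rstrip(".root").rsplit("_",1)[-1]
#   '12'    # same index -> the pair is flagged as duplicated job output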