Example 1
class SamplesManager(object):
    
    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json file where samples cross sections are stored
        @dbs_instance: DBS instance to use
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
            
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_
        
        self.queue_ = queue
        
    def importFromDAS(self,datasets):
        """
        Import datasets from DAS to the catalog.
        @datasets: dataset names or wildcards to be used in the dataset query
        """
        catalog = self.readCatalog()
        
        print "Importing from das %s" % datasets
        if "*" in datasets:
            response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % datasets, 0, 0, False, self.dbs_instance_)
        
            datasets=[]
            for d in response["data"]:
                datasets.append( d["dataset"][0]["name"] )
            print "Datasets to import"
            print "\n".join(datasets)
            
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            if dsetName in catalog:
                catalog[ dsetName ]["files"]  = files
            else:
                catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)
        
        files=[]
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break
                ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: EOS folders to be imported
        """
        catalog = self.readCatalog()

        for folder in folders:
            dsetName = ""
            while not len(dsetName.split("/")) == 4:
                print "enter dataset name for folder %s" % folder, 
                dsetName = raw_input()
                
            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)            
            if dsetName in catalog:
                catalog[ dsetName ]["files"]  = files
            else:
                catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
        
    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_)
        
        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]
        print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass
    
    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog.
        """
        catalog = self.readCatalog()
        
        self.parallel_ = Parallel(50,self.queue_)
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():            
            self.checkDatasetFiles(dataset,catalog)
        
        outcomes = self.parallel_.wait()
        for dsetName,ifile,fName,ret,out in outcomes:
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    extraInfo = json.loads(str(out))
                    for key,val in extraInfo.iteritems():
                        info[key] = val

            print "Writing catalog"
            self.writeCatalog(catalog)
        
        print "Done"
    
    def checkDatasetFiles(self,dsetName,catalog=None):
        """
        Look for corrupted files in dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True
        
        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_)
            wait = True

        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]

        print len(files)
        for ifile,finfo in enumerate(files):            
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile])

        if wait:
            self.parallel_.wait()            
            self.parallel_ = None
        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a": 
                    keepAll = True
                    
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
                
            primaries[ primary ].append(d)
            
        for name,val in primaries.iteritems():
            if len(val) == 1: continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all?" % (name,"\n ".join(val)))
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)
           
        self.writeCatalog(catalog)

    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        ## fName = "root://eoscms//eos/cms%s" % fileName
        fName = fileName
        tmp = ".tmp%s_%d.json"%(dsetName.replace("/","_"),ifile)
        ## print "fggCheckFile.py",[fName,tmp,"2>/dev/null"]
        ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,"2>/dev/null"],interactive=True)[2]
        
        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            print ret, out 
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName,ifile,fileName,ret,out
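The snippet above never shows readCatalog/writeCatalog, but the catalog layout it manipulates can be read off importFromDAS and checkAllDatasets: a JSON object keyed by dataset name, each entry holding a "files" list of {"name", "nevents"} records that the check step later annotates in place. Below is a minimal, self-contained sketch of that structure; the dataset and file names are made up for illustration, and the "bad" flag is the one set by checkAllDatasets.

import json

# Hypothetical catalog content mirroring what importFromDAS builds:
# dataset name -> {"files": [{"name": ..., "nevents": ...}, ...]}
catalog = {
    "/GluGluHToGG/Era-v1/USER": {
        "files": [
            {"name": "/store/user/someone/sample/output_1.root", "nevents": 1000},
            {"name": "/store/user/someone/sample/output_2.root", "nevents": 950},
        ]
    }
}

# checkAllDatasets annotates each file entry in place, e.g. marking
# files whose check job failed with info["bad"] = True.
catalog["/GluGluHToGG/Era-v1/USER"]["files"][0]["bad"] = False

# writeCatalog presumably serializes this dictionary back to the JSON catalog file.
print json.dumps(catalog, indent=2)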
Example 2
class SamplesManager(object):
    
    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None, maxThreads=200,force=False,doContinue=False
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json file where samples cross sections are stored
        @dbs_instance: DBS instance to use
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
            
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_
        
        self.queue_ = queue
        self.maxThreads_ = maxThreads
        self.force_ = force
        self.continue_ = doContinue

    def importFromDAS(self,list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names or wildcards to be used in the dataset query
        """
        catalog = self.readCatalog()
        
        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_)
        
                for d in response["data"]:
                    datasets.append( d["dataset"][0]["name"] )
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            self.addToDataset(catalog,dsetName,files)
            ## if dsetName in catalog:
            ##     if self.force_:
            ##         catalog[ dsetName ]["files"]  = files
            ##     else:
            ##         self.mergeDataset(catalog[ dsetName ],{ "files" : files })
            ## else:
            ##     catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
    

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)
        
        files=[]
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break
                ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: EOS folders to be imported
        """
        catalog = self.readCatalog()
        
        auto=False
        assumeOk=False
        for folder in folders:
            dsetName = ""

            print
            print "importing folder\n %s" % folder
            
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim, sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim,sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp=ask_user("ok?",["y","n","a"])
                        if resp == "n":
                            dsetName = ""
                            auto=False
                        elif resp=="a":
                            assumeOk=True
                if not auto:
                    print "enter dataset name (auto/noauto to enables/disables automatic guessing) ",
                    dsetName = raw_input()
                    if(dsetName=="auto"):
                        auto=True
                    elif (dsetName=="noauto"):
                        auto=False
                
                
            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)            
            self.addToDataset(catalog,dsetName,files)
            ## if dsetName in catalog:
            ##     catalog[ dsetName ]["files"]  = files
            ## else:
            ##     catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
        
    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True)
        
        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]
        ## print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass
    
    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self,match=None,light=False):
        """
        Look for corrupted files in the whole catalog.
        """
        catalog = self.readCatalog()
        
        self.parallel_ = Parallel(50,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True,lsfJobName=".fgg/job")
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        self.outcomes = []
        for dataset in catalog.keys():  
            if match and not fnmatch(dataset,match): continue
            self.checkDatasetFiles(dataset,catalog,light=light)
        # write catalog to avoid redoing duplicates removal
        self.writeCatalog(catalog)
                
        if self.queue_:
            self.parallel_.wait(printOutput=True,handler=self)
            outcomes = self.outcomes
        else:
            outcomes = self.parallel_.wait(printOutput=False)

        ## for dsetName,ifile,fName,ret,out in outcomes:
        nfailed = 0
        for oc in outcomes:
            ign1, ign2, outcome= oc
            ## for ign1, ign2, outcome in outcomes:
            dsetName,ifile,fName,ret,out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                    nfailed += 1
                else:
                    info["bad"] = False
                    extraInfo = json.loads(str(out))
                    if len(extraInfo.keys()) == 0:
                        nfailed += 1
                        info["bad"] = True
                    for key,val in extraInfo.iteritems():
                        info[key] = val

        self.parallel_.stop()

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

        if nfailed > 0:
            print 
            print "WARNING: some of the check jobs failed or did not return any output."
            print "         Those (%d) files were marked a bad and won't be usable for analysis." % nfailed
            print "         Re-running the check command may recover the temporary failures."
            print 
        
        if self.queue_:
            print 
            print "Note: log files may have been written in ./.fgg"
            print "      it's up to you to clean up though..."

    
    def checkDatasetFiles(self,dsetName,catalog=None,light=False):
        """
        Look for corrupted files in dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True
        
        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_,maxThreads=self.maxThreads_,asyncLsf=True)
            wait = True

        print 
        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)
        
        if self.force_ or not catalog[dsetName].get("vetted",False):
            toremove = []
            keep_wildcard=None
            for ifil,eifil in enumerate(files):
                if ifil in toremove:
                    continue
                for jfil,ejfil in enumerate(files[ifil+1:]):
                    if ifil+jfil in toremove:
                        continue
                    if eifil["name"] == ejfil["name"]:
                        toremove.append(ifil)
                    else:
                        iid = eifil["name"].rstrip(".root").rsplit("_",1)[-1]
                        jid = ejfil["name"].rstrip(".root").rsplit("_",1)[-1]
                        if iid == jid:
                            if not keep_wildcard:
                                print "duplicated file index ", iid
                                print eifil["name"]
                                print ejfil["name"]
                                reply=ask_user("keep both (yes/no/matching)? ",["y","n","m"])
                                if reply == "m":             
                                    while not keep_wildcard:
                                        print "enter wildcard matching expression",
                                        keep_wildcard=raw_input()
                                        if ask_user("keep all files matching '%s'?" % keep_wildcard) == "n":
                                            keep_wildcard=None
                            if keep_wildcard:                            
                                imatch=fnmatch(eifil["name"],keep_wildcard)
                                jmatch=fnmatch(ejfil["name"],keep_wildcard)
                                if imatch != jmatch:
                                    if imatch: toremove.append(ifil+jfil)
                                    else: toremove.append(ifil)                            
                                    continue                       
                                else:
                                    print "duplicated file index ", iid
                                    print eifil["name"]
                                    print ejfil["name"]
                                    reply=ask_user("keep both? ")
                            if reply == "n":
                                if ask_user( "keep %s? " % ejfil["name"] ) == "n":
                                    ## files.pop(ifil+jfil)
                                    toremove.append(ifil+jfil)
                                if ask_user( "keep %s? " % eifil["name"] ) == "n":
                                    toremove.append(ifil)
                                    ## files.pop(ifil)
                                    
            for ifile in sorted(toremove,reverse=True):
                ## print ifile
                files.pop(ifile)
            
        print "After duplicates removal: ", len(files)
        nsub = 0
        catalog[dsetName]["vetted"] = True
        if not light:
            info = catalog[dsetName]["files"] = files
            for ifile,finfo in enumerate(files):            
                name = finfo["name"]
                if self.force_ or not "weights" in finfo:
                    nsub+=1
                    self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile],interactive=(self.queue_!=None))
        if nsub == 0:
            print "No files needed to be checked"
        else:
            print "Submitted %d check jobs" % nsub
            
        if wait:
            self.parallel_.wait(printOutput=False)            
            self.parallel_ = None
            
        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a": 
                    keepAll = True
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
                
            primaries[ primary ].append(d)
            
        for name,val in primaries.iteritems():
            if len(val) == 1: continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all (yes/no/merge)?" % (name,"\n ".join(val)),["y","n","m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst],catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)
           
        self.writeCatalog(catalog)
        
    def mergeDataset(self,dst,merge):
        dst["vetted"]=False
        dstFiles=dst["files"]
        mergeFiles=merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append( fil )
        
    def addToDataset(self,catalog,dsetName,files):
        if dsetName in catalog:
            if self.force_:
                catalog[ dsetName ]["files"]  = files
            else:
                self.mergeDataset(catalog[ dsetName ],{ "files" : files })
        else:
            catalog[ dsetName ] = { "files" : files }


    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        fName = fileName
        tmp = ".tmp%s_%d.json"%(dsetName.replace("/","_"),ifile)
        if self.continue_:
            if os.path.exists(tmp):
                print "%s already exists" % tmp
                outcome = self.readJobOutput(tmp,0,"",dsetName,fileName,ifile)
                if self.queue_:
                    self.outcomes.append((None,None,outcome))
                else:
                    return outcome
            return None
        if self.queue_:
            self.parallel_.run("fggCheckFile.py",[fName,tmp,dsetName,str(ifile),"2>/dev/null"],interactive=False)
        else:
            ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,dsetName,str(ifile),"2>/dev/null"],interactive=True)[2]
            return self.readJobOutput(tmp,ret,out,dsetName,fileName,ifile)

        ### try:
        ###     fout = open(tmp)
        ###     out = fout.read()
        ###     fout.close()
        ### except IOError, e:
        ###     print ret, out 
        ###     print e
        ###     out = "{}"
        ### 
        ### os.remove(tmp)
        ### return dsetName,ifile,fileName,ret,out
        

    def readJobOutput(self,tmp,ret,out,dsetName,fileName,ifile):
        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
            os.remove(tmp)
        except Exception, e:
            print ret, out 
            print e
            out = "{}"

        return dsetName,int(ifile),fileName,ret,out
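Example 2's main additions over Example 1 are addToDataset and mergeDataset, so that re-importing a dataset merges new files into the existing entry instead of overwriting it (unless force is set). Below is a standalone sketch of that merge behaviour, re-implemented outside the class for illustration; the function name and sample data are mine, not flashgg's.

def merge_dataset(dst, merge):
    # Append files from 'merge' whose name is not already present in 'dst',
    # and clear the "vetted" flag, as SamplesManager.mergeDataset does.
    dst["vetted"] = False
    known = set(f["name"] for f in dst["files"])
    for fil in merge["files"]:
        if fil["name"] not in known:
            dst["files"].append(fil)
            known.add(fil["name"])

existing = {"files": [{"name": "out_1.root", "nevents": 100}]}
update   = {"files": [{"name": "out_1.root", "nevents": 100},
                      {"name": "out_2.root", "nevents": 80}]}
merge_dataset(existing, update)
print existing["files"]   # out_2.root appended once, out_1.root not duplicated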
Example 3
class SamplesManager(object):
    
    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json file where samples cross sections are stored
        @dbs_instance: DBS instance to use
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update( json.loads( open(fname).read() ) )
            
        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_
        
        self.queue_ = queue
        
    def importFromDAS(self,list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names or wildcards to be used in the dataset query
        """
        catalog = self.readCatalog()
        
        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_)
        
                for d in response["data"]:
                    datasets.append( d["dataset"][0]["name"] )
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            if dsetName in catalog:
                catalog[ dsetName ]["files"]  = files
            else:
                catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self,dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        """
        response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_)
        
        files=[]
        for d in response["data"]:
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                    break
                ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )

        return files

    def importFromEOS(self,folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: EOS folders to be imported
        """
        catalog = self.readCatalog()
        
        auto=False
        assumeOk=False
        for folder in folders:
            dsetName = ""

            print
            print "importing folder\n %s" % folder
            
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim, sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim,sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp=ask_user("ok?",["y","n","a"])
                        if resp == "n":
                            dsetName = ""
                            auto=False
                        elif resp=="a":
                            assumeOk=True
                if not auto:
                    print "enter dataset name (auto/noauto to enables/disables automatic guessing) ",
                    dsetName = raw_input()
                    if(dsetName=="auto"):
                        auto=True
                    elif (dsetName=="noauto"):
                        auto=False
                
                
            print "Importing %s as %s" % (folder,dsetName)
            files = self.getFilesFomEOS(folder)            
            if dsetName in catalog:
                catalog[ dsetName ]["files"]  = files
            else:
                catalog[ dsetName ] = { "files" : files }
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
        
    def getFilesFomEOS(self,dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: dataset name
        """
        
        if not self.parallel_:
            self.parallel_ = Parallel(200,self.queue_)
        
        ret,out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",["find",dsetName],interactive=True)[2]
        ## print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                files.append( {"name":line.replace("/eos/cms",""), "nevents":0} )

        return files

    def findDuplicates(self,dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass
    
    def invalidateBadFiles(self,dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog.
        """
        catalog = self.readCatalog()
        
        self.parallel_ = Parallel(50,self.queue_)
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():            
            self.checkDatasetFiles(dataset,catalog)
        
        outcomes = self.parallel_.wait(printOutput=False)

        ## for dsetName,ifile,fName,ret,out in outcomes:
        for ign1, ign2, outcome in outcomes:
            dsetName,ifile,fName,ret,out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName,ifile,fName,ret,out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    extraInfo = json.loads(str(out))
                    for key,val in extraInfo.iteritems():
                        info[key] = val

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
    
    def checkDatasetFiles(self,dsetName,catalog=None):
        """
        Look for corrupted files in dataset.
        @dsetName: dataset name
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True
        
        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16,self.queue_)
            wait = True

        print 
        print "Checking dataset",dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)
        
        toremove = []
        for ifil,eifil in enumerate(files):
            if ifil in toremove:
                continue
            for jfil,ejfil in enumerate(files[ifil+1:]):
                if ifil+jfil in toremove:
                    continue
                if eifil["name"] == ejfil["name"]:
                    toremove.append(ifil)
                else:
                    iid = eifil["name"].rstrip(".root").rsplit("_",1)[-1]
                    jid = ejfil["name"].rstrip(".root").rsplit("_",1)[-1]
                    if iid == jid:
                        print "duplicated file index ", iid
                        print eifil["name"]
                        print ejfil["name"]
                        reply=ask_user("keep both? ")
                        if reply == "n":
                            if ask_user( "keep %s? " % ejfil["name"] ) == "n":
                                ## files.pop(ifil+jfil)
                                toremove.append(ifil+jfil)
                            if ask_user( "keep %s? " % eifil["name"] ) == "n":
                                toremove.append(ifil)
                                ## files.pop(ifil)
                                
        for ifile in sorted(toremove,reverse=True):
            ## print ifile
            files.pop(ifile)
            
        print "After duplicates removal: ", len(files)
        info = catalog[dsetName]["files"] = files
        for ifile,finfo in enumerate(files):            
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile,[self,name,dsetName,ifile])

        if wait:
            self.parallel_.wait(printOutput=False)            
            self.parallel_ = None
            
        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        datasets,catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y","n","a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a": 
                    keepAll = True
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[ primary ] = []
                
            primaries[ primary ].append(d)
            
        for name,val in primaries.iteritems():
            if len(val) == 1: continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all?" % (name,"\n ".join(val)),["y","n","m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst],catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)
           
        self.writeCatalog(catalog)
        
    def mergeDataset(self,dst,merge):
        dstFiles=dst["files"]
        mergeFiles=merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append( fil )
        
    def checkFile(self,fileName,dsetName,ifile):
        """
        Check if file is valid.
        @fileName: file name
        """
        ## fName = "root://eoscms//eos/cms%s" % fileName
        fName = fileName
        tmp = ".tmp%s_%d.json"%(dsetName.replace("/","_"),ifile)
        ## print "fggCheckFile.py",[fName,tmp,"2>/dev/null"]
        ret,out = self.parallel_.run("fggCheckFile.py",[fName,tmp,"2>/dev/null"],interactive=True)[2]
        
        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            print ret, out 
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName,ifile,fileName,ret,out
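Examples 2 and 3 flag duplicate job outputs in checkDatasetFiles by comparing the trailing _<index> of the file names. Below is a standalone sketch of that comparison, using the same expression as the examples (note that rstrip(".root") strips a trailing character set rather than the literal suffix, which happens to work because the index digits are not in that set); the file names are made up for illustration.

def job_index(name):
    # Extract the trailing job index, e.g. "output_myMicroAOD_12.root" -> "12",
    # mirroring the expression used in checkDatasetFiles.
    return name.rstrip(".root").rsplit("_", 1)[-1]

a = "output_myMicroAOD_12.root"
b = "output_myMicroAOD_resubmit1_12.root"
print job_index(a), job_index(b)     # both "12": candidate duplicate outputs
print job_index(a) == job_index(b)   # True -> the user is asked which file to keep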