Example #1
0
class Spawner:
    """Process one queued notebook request end to end.

    For a request stored in the ``q`` collection this class writes the
    notebook to a local scratch directory, executes it inside the user's
    container (the "egg"), pushes the executed notebook back to the user's
    ``plutons`` GitHub repository and marks the request as processed.
    """

    def __init__(self):
        """Load configuration and create the MongoDB, container and status handles."""
        self.config = Config()
        client = pymongo.MongoClient(f"mongodb+srv://explorer:{self.config.MongoPass}@cluster0-eyzcm.mongodb.net/test?retryWrites=true&w=majority")
        self.db = client.plutoQ
        self.cm = ContainerManager()
        self.status = Status()

    def insertRef(self):
        """Append a pluto attribution markdown cell to the notebook if none mentions 'pluto.studio'.

        Reads and (when a cell is added) rewrites ``self.nbFileName`` in place.
        """
        nbDoc = nbformat.read(self.nbFileName, as_version=4)
        nbCells = nbDoc['cells']
        markdownCells = [x for x in nbCells if x['cell_type'] == 'markdown']

        hasRef = False
        for mdc in markdownCells:
            print(mdc['source'])
            if 'pluto.studio' in mdc['source']:
                hasRef = True
                break

        if not hasRef:
            outObj = nbformat.NotebookNode(cell_type='markdown', metadata={}, source=["This notebook was created using [pluto](http://pluto.studio). Learn more [here](https://github.com/shyams80/pluto)"])
            nbCells.append(outObj)
            nbformat.write(nbDoc, self.nbFileName, version=4)

    def getOutputLength(self):
        """Return the summed ``len()`` of every stdout stream output's ``text`` in ``self.nbFileName``.

        Only code cells are inspected, and only outputs whose ``output_type``
        is 'stream' and whose ``name`` is 'stdout' are counted.
        """
        nbDoc = nbformat.read(self.nbFileName, as_version=4)
        textLength = 0
        for nbCell in nbDoc['cells']:
            if nbCell['cell_type'] != 'code':
                continue

            for nbOut in nbCell['outputs']:
                if nbOut['output_type'] != 'stream' or nbOut.get('name') != 'stdout':
                    continue

                textLength += len(nbOut['text'])

        return textLength

    def upsertGithub(self, diskFileName, githubFileName):
        """Upload a local file to ``self.repo``, updating it if present and creating it on a 404.

        Best effort: every failure is printed and swallowed, nothing is raised
        to the caller.

        Args:
            diskFileName: path of the local file whose bytes are uploaded.
            githubFileName: destination path inside the repository.
        """
        print(f"upserting: {diskFileName} to {githubFileName}")
        with open(diskFileName, mode='rb') as file:
            outFileContent = file.read()

        try:
            fileContent = self.repo.get_contents(githubFileName)
            self.repo.update_file(githubFileName, "response", outFileContent, fileContent.sha)
        except Exception as exp:
            print(exp)
            # Not every exception carries a ``status`` attribute; reading it
            # blindly would raise AttributeError inside this handler.
            if getattr(exp, 'status', None) != 404:
                return

            try:
                self.repo.create_file(githubFileName, "response", outFileContent)
            except Exception as exp2:
                print("Error creating file on github: " + githubFileName)
                print(exp2)

    @staticmethod
    def _isTooLarge(exp):
        """Return True when ``exp.data["errors"][0]['code']`` exists and equals 'too_large'."""
        data = getattr(exp, 'data', None)
        try:
            return data["errors"][0]['code'] == 'too_large'
        except (KeyError, IndexError, TypeError):
            # No error list on this exception (e.g. a 404 or a network error).
            return False

    def _clearOutputs(self):
        """Strip all cell outputs from ``self.nbFileName`` in place via ``jupyter nbconvert``."""
        # The argument list is built directly so that file names containing
        # spaces or quotes reach nbconvert as a single argument.
        # NOTE(review): ``errors=True`` is kept from the original call; it is
        # subprocess's text-decoding error-handler argument, so ``check=True``
        # may have been what was intended -- confirm.
        subprocess.run(
            ["jupyter", "nbconvert", "--ClearOutputPreprocessor.enabled=True", "--inplace", self.nbFileName],
            env=os.environ, errors=True)

    def _replaceOutputWithLimitNotice(self):
        """Clear the notebook's outputs and attach a 'limit exceeded' stderr message to its first code cell."""
        self._clearOutputs()
        nbDoc = nbformat.read(self.nbFileName, as_version=4)

        for nbCell in nbDoc['cells']:
            # Only code cells have an ``outputs`` list to append to.
            if nbCell['cell_type'] != 'code':
                continue

            nbCell['execution_count'] = 1
            outObj = nbformat.NotebookNode(output_type='stream', name='stderr', text=['total output string length exceeded 10000 characters. please stay within the limit.'])
            nbCell['outputs'].append(outObj)
            break

        nbformat.write(nbDoc, self.nbFileName, version=4)

    def _uploadAsMarkdown(self, egg, qId, githubFileName, githubPath):
        """Fallback when the notebook is too large for GitHub: upload a markdown rendering instead.

        The notebook is converted to markdown inside the container, the result
        is fetched as ``<name>.tar.gz``, unpacked into ``self.plutoPath`` and
        the ``.md`` file plus any ``<name>_files`` directory are upserted.
        """
        print("file is too big!")
        self.status.Update(qId, 'file is too big!')

        egg.execute(["jupyter", "nbconvert", "--to", "markdown", "--execute", f"/home/pluto/{githubFileName}", "--allow-errors", "--ExecutePreprocessor.timeout=1200"])

        filePattern = githubFileName.replace(".ipynb", "")

        self.status.Update(qId, 'creating markdown...')
        # tard.sh lives in the container and is not visible here; presumably it
        # packs the markdown output into <filePattern>.tar.gz -- verify.
        egg.execute(["./tard.sh", filePattern])

        tarName = f"{self.plutoPath}{filePattern}.tar.gz"
        resp = egg.files.get(f"/home/pluto/{filePattern}.tar.gz")
        with open(tarName, mode='wb') as file:
            file.write(resp)

        subprocess.run(["tar", "xvf", tarName, "-C", self.plutoPath], env=os.environ, errors=True)

        # A notebook at the repository root has no directory part; avoid
        # producing a path with a leading '/'.
        repoPrefix = f"{githubPath}/" if githubPath else ""

        self.status.Update(qId, 'uploading markdown...')
        self.upsertGithub(f"{self.plutoPath}{filePattern}.md", f"{repoPrefix}{filePattern}.md")

        imagesDir = f"{self.plutoPath}{filePattern}_files"
        if os.path.isdir(imagesDir):
            self.status.Update(qId, 'uploading images...')
            for fname in os.listdir(imagesDir):
                self.upsertGithub(f"{imagesDir}/{fname}", f"{repoPrefix}{filePattern}_files/{fname}")

    def Execute(self, meta):
        """Run the notebook referenced by ``meta`` and publish the result.

        Args:
            meta: mapping with ``'id'`` (string form of the request's ``_id``
                in the ``q`` collection) and ``'githubUser'``.

        Side effects: status updates for the request, files under
        ``/home/pluto/notebook-temp/<id>/`` (removed at the end), commits to
        the user's ``plutons`` repository, and an update of the request
        document with ``isProcessed``, ``processedOn`` and the gzipped
        executed notebook.
        """
        print(meta)

        qId = ObjectId(meta['id'])
        print('acquiring egg')
        self.status.Update(qId, 'acquiring egg')

        egg = self.cm.GetProcessor(qId, meta['githubUser'])

        request = self.db.q.find_one({'_id': qId})

        githubUserName = request['githubUser']
        print(f"processing for: {githubUserName}")
        self.status.Update(qId, 'processing')

        githubAcc = Github(request['githubTok'])
        user = githubAcc.get_user()
        self.repo = user.get_repo("plutons")

        self.plutoPath = "/home/pluto/notebook-temp/" + meta['id'] + "/"
        os.makedirs(self.plutoPath, exist_ok=True)

        fullPath = request['file']
        notebook = gzip.decompress(request['notebook'])

        # rpartition yields an empty directory part when there is no '/',
        # whereas slicing on rfind() == -1 would chop the last character.
        githubPath, _, githubFileName = fullPath.rpartition('/')

        self.nbFileName = self.plutoPath + githubFileName
        print(f"processing notebook: {self.nbFileName}")
        with open(self.nbFileName, mode='wb') as file:
            file.write(notebook)

        self._clearOutputs()
        self.insertRef()

        egg.files.recursive_put(self.plutoPath, "/home/pluto/")

        print("executing in egg")
        self.status.Update(qId, 'executing in egg')
        egg.execute(["jupyter", "nbconvert", "--to", "notebook", "--execute", f"/home/pluto/{githubFileName}", "--inplace", "--allow-errors", "--ExecutePreprocessor.timeout=1200"])

        resp = egg.files.get(f"/home/pluto/{githubFileName}")
        with open(self.nbFileName, mode='wb') as file:
            file.write(resp)

        textLength = self.getOutputLength()
        print(f"total output length: {textLength}")

        # Everyone except 'shyams80' is limited to 10000 characters of stdout.
        if githubUserName != 'shyams80' and textLength > 10000:
            self._replaceOutputWithLimitNotice()

        with open(self.nbFileName, mode='rb') as file:
            outFileContent = file.read()

        tooBig = False
        try:
            fileContent = self.repo.get_contents(fullPath)
            self.repo.update_file(fullPath, "response", outFileContent, fileContent.sha)
        except Exception as exp:
            print(exp)
            tooBig = self._isTooLarge(exp)

        if tooBig:
            self._uploadAsMarkdown(egg, qId, githubFileName, githubPath)

        egg.files.delete(f"/home/pluto/{githubFileName}")
        shutil.rmtree(self.plutoPath)

        self.db.q.update_one({'_id': qId}, {'$set': {'isProcessed': True, 'processedOn': datetime.now(), 'notebook': gzip.compress(outFileContent)}})
        self.status.Update(qId, 'finished')
Example #2
0
class Enqueuer:
    """Moves pending requests from the MongoDB ``q`` collection onto the 'pluto' job queue."""

    def __init__(self):
        """Load configuration and connect to the job queue, MongoDB and the container manager."""
        self.config = Config()
        self.metaQ = Queue('pluto',
                           connection=Redis('windows', 6379, db=1),
                           default_timeout=1 * 3600)
        self.status = Status()

        client = pymongo.MongoClient(
            f"mongodb+srv://explorer:{self.config.MongoPass}@cluster0-eyzcm.mongodb.net/test?retryWrites=true&w=majority"
        )
        self.db = client.plutoQ
        self.cm = ContainerManager()

    def Cleanup(self):
        """Delete requests created more than 30 days ago and drop containers of users with no remaining requests."""
        cutoff = datetime.now() - timedelta(days=30)
        dmRet = self.db.q.delete_many({'createdOn': {'$lt': cutoff}})
        print(f"deleted {dmRet.deleted_count} from mongo queue")

        # Users that still have at least one document in the queue collection.
        activeUsers = self.db.q.distinct('githubUser')
        self.cm.KeepOnly(activeUsers)

    def Queue(self, meta):
        """Enqueue a ``Spawn`` job for ``meta``; the job's result is not retained (``result_ttl=0``)."""
        self.metaQ.enqueue(Spawn, meta, result_ttl=0)

    def Loop(self):
        """Pick one request that has not been enqueued yet, mark it as enqueued and queue it.

        A request qualifies when ``isEnqueued`` exists and is False, or when
        ``isEnqueued`` is absent and ``isProcessed`` is False. Returns without
        doing anything when no such request exists.
        """
        request = self.db.q.find_one({
            '$or': [{
                '$and': [{
                    'isEnqueued': {
                        '$exists': True
                    }
                }, {
                    'isEnqueued': False
                }]
            }, {
                '$and': [{
                    'isEnqueued': {
                        '$exists': False
                    }
                }, {
                    'isProcessed': False
                }]
            }]
        })
        if request is None:
            return

        qId = ObjectId(request['_id'])

        self.db.q.update_one(
            {'_id': qId},
            {'$set': {
                'isEnqueued': True,
                'enqueuedOn': datetime.now()
            }})
        self.status.Update(qId, 'queued')

        meta = {"id": str(request['_id']), "githubUser": request['githubUser']}
        self.Queue(meta)

    def FailLoop(self):
        """Mark every job in the failed-job registry as processed and remove it from the queue.

        Each failed request gets ``isProcessed``/``processedOn`` set and the
        status 'system failure. try again!'. Registry ids whose job can no
        longer be fetched are simply removed.
        """
        print(
            f"total number of failed jobs: {self.metaQ.failed_job_registry.count}"
        )

        for failedId in self.metaQ.failed_job_registry.get_job_ids():
            failedJob = self.metaQ.fetch_job(failedId)
            if failedJob is None:
                # The job data is gone, so there is no meta to report against;
                # drop the stale id instead of crashing on ``None.args``.
                self.metaQ.remove(failedId)
                self.metaQ.failed_job_registry.remove(failedId)
                continue

            failedMeta = failedJob.args[0]
            print(failedMeta)
            qId = ObjectId(failedMeta['id'])
            self.db.q.update_one(
                {'_id': qId},
                {'$set': {
                    'isProcessed': True,
                    'processedOn': datetime.now()
                }})
            self.status.Update(qId, 'system failure. try again!')
            self.metaQ.remove(failedId)
            self.metaQ.failed_job_registry.remove(failedJob)
Example #3
0
class ContainerManager:
    """Manages per-user LXD containers ("eggs") cloned from the 'goose' template container."""

    # LXD status code for a stopped container, as compared against
    # ``state().status_code`` throughout this class.
    _STOPPED = 102

    def __init__(self):
        """Load configuration and connect to the local LXD daemon."""
        self.config = Config()
        self.client = pylxd.Client()

        self.status = Status()

    def DeleteProcessor(self, githubUserName):
        """Stop (if needed) and delete the container named ``githubUserName``.

        A container that does not exist is ignored.

        Raises:
            Exception: if asked to delete the 'goose' template container.
        """
        if githubUserName == "goose":
            raise Exception("maverick!")
        try:
            cntnr = self.client.containers.get(githubUserName)
            if cntnr.state().status_code != self._STOPPED:
                cntnr.stop(wait=True)
            cntnr.delete(wait=True)
        except pylxd.exceptions.NotFound:
            pass

    def KeepOnly(self, githubUserNames):
        """Delete every container whose name is not in ``githubUserNames``; 'goose' is never deleted."""
        # Filter 'goose' out rather than list.remove() it, which would raise
        # ValueError when no goose container is listed.
        existingContainers = {
            n.name for n in self.client.containers.all() if n.name != 'goose'
        }
        for td in existingContainers.difference(githubUserNames):
            self.DeleteProcessor(td)

    def _gooseTemplates(self):
        """Return the images whose first alias is named 'goose'."""
        return [
            t for t in self.client.images.all()
            if t.aliases and t.aliases[0]['name'] == 'goose'
        ]

    def _gooseTemplate(self, qId):
        """Return the 'goose' image, publishing it from the goose container first if it is missing."""
        templates = self._gooseTemplates()

        #the goose image is deleted if there has been an update to pluto or new libraries are installed
        if not templates:
            self.status.Update(qId, 'cloning the goose... can take a while')
            goose = self.client.containers.get('goose')
            if goose.state().status_code != self._STOPPED:
                goose.stop(wait=True)
            template = goose.publish(public=True, wait=True)
            template.add_alias('goose', 'lays golden eggs')
            goose.start(wait=True)
            self.status.Update(qId, 'goose is loose!')

            templates = self._gooseTemplates()

        return templates[0]

    def GetProcessor(self, qId, githubUserName):
        """Return a started, configured container for ``githubUserName``.

        Reuses the user's existing container unless it was created before the
        current goose image was uploaded, in which case it is deleted and
        re-created. Afterwards ``/etc/hosts`` is rewritten from the configured
        servers and ``/root/start.sh`` is run inside the container.

        Args:
            qId: request id used for status updates.
            githubUserName: name of the user's container.

        Raises:
            Exception: if ``githubUserName`` is 'goose'.
            pylxd.exceptions.NotFound: if the goose image is missing and the
                goose container cannot be found to publish it from.
        """
        if githubUserName == "goose":
            raise Exception("maverick!")

        # Resolved before looking at the egg so the creation path below always
        # has a template to clone from.
        template = self._gooseTemplate(qId)

        try:
            egg = self.client.containers.get(githubUserName)
        except pylxd.exceptions.NotFound:
            egg = None

        #if the egg was created before the goose image, delete it and re-create
        if egg is not None and egg.created_at < template.uploaded_at:
            self.DeleteProcessor(githubUserName)
            egg = None

        if egg is None:
            #create an egg from the goose image
            dna = {
                "ephemeral": False,
                "name": githubUserName,
                "source": {
                    "type": "image",
                    "certificate": "",
                    "fingerprint": template.fingerprint
                }
            }

            self.status.Update(qId, 'laying an egg... can take a while')
            egg = self.client.containers.create(dna, wait=True)
            egg.start(wait=True)
            self.status.Update(qId, 'egg laid')
        else:
            if egg.state().status_code == self._STOPPED:
                egg.start(wait=True)
            self.status.Update(qId, 'egg initialized')

        #update the hosts file
        hostLines = ["127.0.0.1 localhost\n"]
        for server in self.config['DEFAULT']['SERVERS'].split(','):
            lup = socket.gethostbyname(server)
            hostLines.append(f"{lup} {server}\n")

        egg.files.put("/etc/hosts", "".join(hostLines))

        #setup the firewall
        ret = egg.execute(["/root/start.sh"])
        print(ret.exit_code)

        self.status.Update(qId, 'egg initialized')
        return egg