Python bigjob Exemples, bigjob.bigjob Python Exemples

Exemple #1

0

Afficher le fichier

    def __start_bigjob(self, bj_dict):
        """Launch a BigJob pilot for the resource described by ``bj_dict``.

        Required keys: "resource_url", "bigjob_agent", "number_nodes",
        "queue" and "allocation".  Optional keys: "processes_per_node"
        (falls back to "1"), "walltime" (falls back to 3600) and
        "working_directory" (falls back to "<current dir>/agent").

        ``bj_dict`` is modified in place: "processes_per_node" is filled in
        when missing, and the bookkeeping entries "bigjob", "free_cores",
        "to_be_terminated" and "lock" are added after the pilot is started.
        """
        resource_url = bj_dict["resource_url"]
        logging.debug("start bigjob at: " + resource_url)
        pilot = bigjob.bigjob(self.advert_host)

        # setdefault returns the configured value and, when the key is
        # absent, also records the "1" fallback in the dict.
        procs_per_node = bj_dict.setdefault("processes_per_node", "1")
        max_walltime = bj_dict.get("walltime", 3600)
        agent_dir = bj_dict.get("working_directory", os.getcwd() + "/agent")

        pilot.start_pilot_job(
            resource_url,
            bj_dict["bigjob_agent"],
            bj_dict["number_nodes"],
            bj_dict["queue"],
            bj_dict["allocation"],
            agent_dir,
            None,
            max_walltime,
            procs_per_node)

        # keep the pilot handle around so later calls can reach it via the dict
        bj_dict["bigjob"] = pilot
        bj_dict["free_cores"] = int(bj_dict["number_nodes"]) * int(procs_per_node)
        bj_dict["to_be_terminated"] = False
        # guards updates to the free-core count
        bj_dict["lock"] = threading.Lock()

Exemple #2

0

Afficher le fichier

Fichier : pilotcompute_manager.py Projet : mvrk/BigJob

    def __init__(self, pilot_compute_service=None,
                       bigjob_object=None,
                       pilot_compute_description=None,
                       pilot_url=None): # for reconnecting
        """ Create/reconnect to a Pilot Compute.

            Keyword arguments:
            pilot_compute_service     -- owning service; stored only when a
                                         new Pilot Compute is created
            bigjob_object             -- BigJob instance to wrap when a new
                                         Pilot Compute is created
            pilot_compute_description -- description kept on the instance
                                         when a new Pilot Compute is created
            pilot_url                 -- restore from cp_id

            When pilot_url is given the implementation will attempt to
            reconnect to the PC instance referenced by the pilot_url; the
            other three arguments are not used in that case.

        """

        self.__subjobs = []
        self.__pilot_compute_service = None
        # None is a singleton: compare by identity rather than equality
        if pilot_url is None:
            logger.debug("Create PilotCompute for BigJob: " + str(bigjob_object))
            self.pilot_compute_description = pilot_compute_description
            self.__pilot_compute_service = pilot_compute_service
            self.__bigjob = bigjob_object
        else:
            logger.debug("Reconnect to an existing Pilot Compute")
            self.__bigjob = bigjob(pilot_url=pilot_url)

        # Store the URL of pilot compute service for later reference
        # This URL is used as central queue for a set of BJs in the
        # ComputeDataServiceDecentral
        if self.__pilot_compute_service is not None:
            self.coordination_queue = pilot_compute_service.coordination_queue

Exemple #3

0

Afficher le fichier

Fichier : example_gce_single.py Projet : ashleyz/BigJob

def main():
    """Run a single ``/bin/echo`` sub-job through a BigJob pilot on GCE.

    Starts the pilot at the hard-coded ``lrms_url``, submits one sub-job,
    polls its state every two seconds until it reports "Done" or "Failed",
    and cancels the pilot on the way out -- also when submission or polling
    raises or the script is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 1
    number_of_processes = 1
    workingdirectory = "."  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE
    # NOTE(review): "locahost" looks like a typo for "localhost"; the value is
    # left untouched because the host part may be irrelevant here -- confirm.
    lrms_url = "gce+ssh://locahost"
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node)

    # Everything after a successful start runs under try/finally so the pilot
    # is not left running if anything below fails.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        jd.input_data = ["hi", "ho"]

        # specify an optional working directory if sub-job should be executed
        # outside of bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state in ("Failed", "Done"):
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

Exemple #4

0

Afficher le fichier

    def __start_bigjob(self, bj_dict):
        """Create a BigJob for ``bj_dict`` and start its pilot job.

        Required keys: "resource_url", "number_of_processes", "queue",
        "allocation" and "pilot_compute_description".  Optional keys:
        "processes_per_node" (falls back to "1" and is then written back
        into ``bj_dict``), "walltime" (falls back to 3600),
        "working_directory" and "file_transfer" (both fall back to None).

        Returns the started BigJob object.
        """
        resource_url = bj_dict["resource_url"]
        logger.debug("start bigjob at: " + resource_url)
        pilot = bigjob(self.coordination_url)

        # reads the configured value; records "1" in the dict when it is absent
        procs_per_node = bj_dict.setdefault("processes_per_node", "1")
        max_walltime = bj_dict.get("walltime", 3600)
        agent_dir = bj_dict.get("working_directory")
        staged_files = bj_dict.get("file_transfer")

        pilot.start_pilot_job(lrms_url=resource_url,
                              number_nodes=bj_dict["number_of_processes"],
                              queue=bj_dict["queue"],
                              project=bj_dict["allocation"],
                              working_directory=agent_dir,
                              walltime=max_walltime,
                              processes_per_node=procs_per_node,
                              filetransfers=staged_files,
                              external_queue=self.coordination_queue,
                              pilot_compute_description=bj_dict["pilot_compute_description"])
        return pilot

Exemple #5

0

Afficher le fichier

    def __init__(self,
                 pilot_compute_service=None,
                 bigjob_object=None,
                 pilot_compute_description=None,
                 pilot_url=None):  # for reconnecting
        """ Create/reconnect to a Pilot Compute.

            Keyword arguments:
            pilot_compute_service     -- owning service (create mode only)
            bigjob_object             -- BigJob instance to wrap (create mode only)
            pilot_compute_description -- description stored on the instance
                                         (create mode only)
            pilot_url                 -- restore from cp_id

            Without a pilot_url a new Pilot Compute is set up around
            bigjob_object.  With a pilot_url the implementation will attempt
            to reconnect to the PC instance referenced by it and the other
            arguments are left unused.

        """
        self.__subjobs = []
        self.__pilot_compute_service = None
        # identity test: None is a singleton, so "is" is the correct check
        if pilot_url is None:
            logger.debug("Create PilotCompute for BigJob: " +
                         str(bigjob_object))
            self.pilot_compute_description = pilot_compute_description
            self.__pilot_compute_service = pilot_compute_service
            self.__bigjob = bigjob_object
        else:
            logger.debug("Reconnect to an existing Pilot Compute")
            self.__bigjob = bigjob(pilot_url=pilot_url)

        # Store the URL of pilot compute service for later reference
        # This URL is used as central queue for a set of BJs in the
        # ComputeDataServiceDecentral
        if self.__pilot_compute_service is not None:
            self.coordination_queue = pilot_compute_service.coordination_queue

Exemple #6

0

Afficher le fichier

def main():
    """Submit twelve ``/bin/hostname`` sub-jobs to a BigJob pilot via PBS.

    Starts the pilot at ``pbs://localhost``, submits the same job description
    twelve times, calls ``bj.wait()`` and finally cancels the pilot.  The
    cancel also runs when submission or waiting raises or is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 8
    number_nodes = 24
    workingdirectory = os.getcwd()  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost" # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working
    lrms_url = "pbs://localhost"  # resource url to run the jobs on localhost

    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url, number_nodes, queue, project,
                       workingdirectory, userproxy, walltime,
                       processes_per_node)

    # From here on the pilot exists, so make sure it is cancelled whatever
    # happens below.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(
            bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/hostname"
        jd.number_of_processes = "2"
        jd.spmd_variation = "single"
        jd.arguments = [""]
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"

        # the same description is submitted twelve times; the counter itself
        # is not needed
        for _ in range(12):
            sj = subjob()
            sj.submit_job(bj.pilot_url, jd)

        bj.wait()
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

Exemple #7

0

Afficher le fichier

Fichier : example_gce_single.py Projet : saga-project/BigJob

def main():
    """Start a BigJob pilot on GCE, run one ``/bin/echo`` sub-job, clean up.

    The sub-job state is polled every two seconds until it is "Done" or
    "Failed".  Once the pilot has been started it is always cancelled, even
    if submitting or polling the sub-job fails or is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 1
    number_of_processes = 1
    workingdirectory = "."  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE
    # NOTE(review): the host reads "locahost" (sic) -- possibly a typo for
    # "localhost"; kept as-is pending confirmation that the host is unused.
    lrms_url = "gce+ssh://locahost"
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url, number_of_processes, queue, project,
                       workingdirectory, userproxy, walltime,
                       processes_per_node)

    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(
            bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        jd.input_data = ["hi", "ho"]

        # specify an optional working directory if sub-job should be executed
        # outside of bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        finished_states = ("Failed", "Done")
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state in finished_states:
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob (runs on success, failure and Ctrl-C alike)
        bj.cancel()

Exemple #8

0

Afficher le fichier

Fichier : pilotcompute_manager.py Projet : mvrk/BigJob

    def __start_bigjob(self, bj_dict):
        """Create a BigJob for ``bj_dict``, start its pilot job and return it.

        Required keys: "resource_url", "number_of_processes" (converted with
        int()), "queue", "project", "spmd_variation" and
        "pilot_compute_description".  Optional keys: "processes_per_node"
        ("1" is used when it is missing or equals the string 'None'),
        "walltime" (3600 when missing), "working_directory" and
        "file_transfer" (None when missing).  ``bj_dict`` is not modified.
        """
        resource_url = bj_dict["resource_url"]
        logger.debug("start bigjob at: " + resource_url)
        pilot = bigjob(self.coordination_url)

        # A missing key is mapped onto the same 'None' marker string that an
        # unset value carries, so a single comparison covers both cases.
        procs_per_node = bj_dict.get("processes_per_node", 'None')
        if procs_per_node == 'None':
            procs_per_node = "1"

        max_walltime = bj_dict.get("walltime", 3600)
        agent_dir = bj_dict.get("working_directory")
        staged_files = bj_dict.get("file_transfer")

        pilot.start_pilot_job(
            lrms_url=resource_url,
            number_nodes=int(bj_dict["number_of_processes"]),
            queue=bj_dict["queue"],
            project=bj_dict["project"],
            working_directory=agent_dir,
            walltime=max_walltime,
            processes_per_node=procs_per_node,
            filetransfers=staged_files,
            spmd_variation=bj_dict["spmd_variation"],
            external_queue=self.coordination_queue,
            pilot_compute_description=bj_dict["pilot_compute_description"],
        )
        return pilot

Exemple #9

0

Afficher le fichier

Fichier : example_condor_single.py Projet : ashleyz/BigJob

def main():
    """Stage a file with a BigJob pilot on Condor and ``cat`` it in a sub-job.

    ``test.txt`` from the parent directory of this script is listed as a
    pilot file transfer, one ``/bin/cat test.txt`` sub-job is submitted, and
    its state is polled every two seconds until it is "Done" or "Failed".
    The pilot is cancelled afterwards, also when submission or polling raises
    or is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost" # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working
    lrms_url = "condor://localhost"

    ##########################################################################################

    input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt")
    bj_filetransfers = [input_file + " > test.txt"]

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       None,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node,
                       bj_filetransfers)

    # The pilot is running from here on; guarantee it gets cancelled.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()

        jd.executable = "/bin/cat"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        jd.arguments = ["test.txt"]
        #jd.working_directory = ""
        jd.output = "sj-stdout.txt"
        jd.error = "sj-stderr.txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while True:
            state = str(sj.get_state())
            bj_state = bj.get_state()
            print("bj state: " + str(bj_state) + " state: " + state)
            if state in ("Failed", "Done"):
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

Exemple #10

0

Afficher le fichier

        return False


""" Test Job Submission via Advert """
if __name__ == "__main__":
    # Reconnect to an already running BigJob (URL given as the only command
    # line argument) and, unless it is already "Done", prepare further
    # sub-jobs for it.

    starttime = time.time()

    if len(sys.argv) == 2:
        reconnect_url = sys.argv[1]
    else:
        print("Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>")
        sys.exit(-1)

    print("Reconnect to Pilot Job/BigJob at: " + reconnect_url)
    bj = bigjob(pilot_url=reconnect_url)

    # str() must wrap only the state: concatenating the raw state with a
    # string first fails whenever get_state() does not return a str.
    print("Pilot Job/BigJob URL: " + bj.pilot_url
          + " State: " + str(bj.get_state())
          + " Number of SJs: " + str(len(bj.list_subjobs())))

    ##########################################################################################
    # Submit some more subjobs
    if bj.get_state() != "Done":
        jobs = []
        job_start_times = {}
        job_states = {}
        for i in range(0, NUMBER_JOBS):
            jd = description()
            jd.executable = "/bin/date"
            jd.number_of_processes = "1"
            jd.spmd_variation = "single"

Exemple #11

0

Afficher le fichier

Fichier : 1bj_1m.py Projet : ssarip1/async-re

        temps.append(temp)

    ##################################################################################
    # Start BigJob
    # Parameter for BigJob
    # bigjob_agent = os.getcwd() + "/bigjob_agent_launcher.sh" # path to agent
    # bigjob_agent = "/bin/echo"
    nodes = BIGJOB_SIZE  # number nodes for agent
    # workingdirectory=os.getcwd() +"/agent"  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # One BigJob object is created per index and collected in bjs; the
    # queue, project, agent path, working directory and resource URL are
    # picked per index below.
    bjs = []
    i = 0
    # NOTE(review): if this code lives inside a function, rebinding
    # NUMBER_BIGJOBS here makes it a local name unless it is declared global
    # -- confirm against the enclosing scope, which is not visible here.
    NUMBER_BIGJOBS = int(NUMBER_BIGJOBS)
    for i in range(0, NUMBER_BIGJOBS):
        bj = bigjob.bigjob(advert_host)
        bjs.append(bj)
        if i == 0:
            # first pilot: GRAM/PBS job manager on HOST, paths under WORK_DIR
            queue = "workq"
            project = "loni_jhabig10"
            bigjob_agent = WORK_DIR + "/bigjob_agent_launcher.sh"  # path to agent
            workingdirectory = WORK_DIR + "/agent"  # working directory for agent
            lrms_url = "gram://" + HOST + "/jobmanager-pbs"
        elif i == 1:
            # second pilot: GRAM/SGE job manager on REMOTE1, paths under WORK_DIR1
            queue = "development"
            project = "TG-MCB090174"
            bigjob_agent = WORK_DIR1 + "/bigjob_agent_launcher.sh"  # path to agent
            workingdirectory = WORK_DIR1 + "/agent"  # working directory for agent
            lrms_url = "gram://" + REMOTE1 + "/jobmanager-sge"
        elif i == 2:
            # third pilot: paths under WORK_DIR2 (the rest of this branch is
            # not part of the visible excerpt)
            bigjob_agent = WORK_DIR2 + "/bigjob_agent_launcher.sh"  # path to agent

Exemple #12

0

Afficher le fichier

Fichier : example_1_BigJob_N_SubJobs.py Projet : ssarip1/BigJob

    ##########################################################################################
    # Before editing: check the queue layout on futuregrid, the ppn, your
    # project id and the per-queue walltime limits, then adjust the values
    # below to match.
    #
    # Queue the BigJob is submitted to; None selects the default queue.
    queue = "batch"
    # Allocation information; None selects the default.
    project = None
    # Time in minutes. There are limits on the time you can request.
    walltime = 60

    # ppn
    processes_per_node = 8
    # The total number of processes (BigJob size), used to run Jobs
    number_of_processes = 24
    # working directory for agent
    workingdirectory = os.path.join(os.getcwd(), "agent")
    ##########################################################################################

    # a parenthesised single expression prints the same text as the bare
    # print statement
    print("Start Pilot Job/BigJob at: " + RESOURCEMGR_URL)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(RESOURCEMGR_URL, None, number_of_processes,
                       queue, project, workingdirectory,
                       None, walltime, processes_per_node)

    print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

    ##########################################################################################
    jobs = []
    job_start_times = {}

Exemple #13

0

Afficher le fichier

def main():
    """Run one ``/bin/date`` sub-job through a BigJob pilot on Condor-G.

    Starts the pilot at the hard-coded ``condorg://`` resource URL, submits a
    single sub-job and polls its state every two seconds until it is "Done"
    or "Failed".  The pilot is cancelled afterwards, also when submission or
    polling raises or is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(),
                                    "agent")  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost" # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working

    #lrms_url = "fork://localhost" # resource url to run the jobs on localhost
    lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs"

    #lrms_url = "ssh://[email protected]"
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url, None, number_of_processes, queue, project,
                       workingdirectory, userproxy, walltime,
                       processes_per_node)

    # The pilot now holds remote resources; release them no matter how the
    # rest of the script ends.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(
            bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()

        jd.executable = "/bin/date"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        #jd.arguments = ["match -f  bgr1.fa -A 0  -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"]
        jd.arguments = [""]
        #jd.working_directory = ""
        jd.output = "bfast-stdout.txt"
        jd.error = "bfast-stderr.txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state in ("Failed", "Done"):
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

Exemple #14

0

Afficher le fichier

Fichier : example_condorg_single.py Projet : ashleyz/BigJob

def main():
    """Start a BigJob pilot via Condor-G, run ``/bin/date`` once, clean up.

    A single sub-job is submitted to the pilot and its state is polled every
    two seconds until it reports "Done" or "Failed".  Once the pilot has been
    started it is always cancelled, even if submitting or polling fails or
    the script is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost" # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working

    #lrms_url = "fork://localhost" # resource url to run the jobs on localhost
    lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs"

    #lrms_url = "ssh://[email protected]"
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       None,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node)

    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()

        jd.executable = "/bin/date"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        #jd.arguments = ["match -f  bgr1.fa -A 0  -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"]
        jd.arguments = [""]
        #jd.working_directory = ""
        jd.output = "bfast-stdout.txt"
        jd.error = "bfast-stderr.txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        finished_states = ("Failed", "Done")
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state in finished_states:
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob (runs on success, failure and Ctrl-C alike)
        bj.cancel()

Exemple #15

0

Afficher le fichier

Fichier : example_single_filestaging_globusonline.py Projet : saga-project/BigJob

def main():
    """Stage ``test.txt`` via Globus Online and ``cat`` it in a sub-job.

    The pilot's working directory and both file transfers are given as
    ``go://`` URLs built from GLOBUS_ONLINE_USER / GLOBUS_ONLINE_PASSWORD.
    One ``/bin/cat test.txt`` sub-job is submitted and polled every two
    seconds until it is "Done" or "Failed".  The pilot is cancelled
    afterwards, also when submission or polling raises or is interrupted.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost" # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working
    lrms_url = "sge-ssh://lonestar.tacc.teragrid.org"

    # To use Globus Online the working directory must be specified using the
    # following conventions.
    # NOTE(review): the password is embedded in these URLs; make sure they
    # are never printed or logged.
    go_credentials = "go://" + GLOBUS_ONLINE_USER + ":" + GLOBUS_ONLINE_PASSWORD
    workingdirectory = go_credentials + "@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/"
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)

    # test.txt next to this script is the source of both transfers; only the
    # destination keyword differs between pilot and sub-job.
    local_source = (go_credentials + "@globusonline.org?ep=drelu#MacBook&path="
                    + os.path.dirname(os.path.abspath(__file__)) + "/test.txt")
    bj_filetransfers = [local_source + " > BIGJOB_WORK_DIR"]

    bj.start_pilot_job(lrms_url,
                       None,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node,
                       bj_filetransfers)

    # The pilot is running from here on; guarantee it gets cancelled.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

        ######################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/cat"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        jd.arguments = ["test.txt"]
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"
        jd.file_transfer = [local_source + " > SUBJOB_WORK_DIR"]

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state in ("Failed", "Done"):
                break
            time.sleep(2)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

Exemple #16

0

Afficher le fichier

Fichier : example_kraken_single.py Projet : mvrk/BigJob

def main():
    """Run 24 ``/bin/hostname`` sub-jobs inside a single BigJob and wait for them.

    Starts a pilot job (BigJob) through the ``xt5torque://localhost`` SAGA
    resource URL, submits the sub-jobs to it, polls their states every two
    seconds until every one of them is ``Done`` or ``Failed``, and finally
    cancels the BigJob.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 600
    processes_per_node = 12
    number_of_processes = 24
    number_of_subjobs = 24  # each sub-job below requests a single process
    workingdirectory = "/lustre/scratch/aluckow/agent"  # working directory for agent
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus resource url used when globus is used (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduling system (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launches jobs on the target machine, no scheduler
    #   "pbs-ssh://louie1.loni.org"               submit to the scheduling system of a remote machine
    #   "xt5torque://localhost"                   torque resource url
    #
    # Please ensure that the respective SAGA adaptor is installed and working.
    lrms_url = "xt5torque://localhost"  # resource url to run the jobs on localhost

    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node)

    print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))

    ##########################################################################################
    # Submit SubJobs through BigJob
    jd = description()
    jd.executable = "/bin/hostname"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = [""]
    #jd.working_directory = "/tmp"
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"

    sjs = []
    for _ in range(0, number_of_subjobs):
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)
        sjs.append(sj)

    #########################################
    # busy wait for completion
    finished_states = ("Failed", "Done")
    while True:
        states = [str(sj.get_state()) for sj in sjs]
        for idx, state in enumerate(states):
            print("sj: %d state: %s" % (idx, state))

        # Every sub-job has to be finished; looking only at the last one
        # would cancel the BigJob while earlier sub-jobs are still running.
        if all(state in finished_states for state in states):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()

Exemple #17

0

Afficher le fichier

Fichier : example_2_BigJobs_N_SubJobs.py Projet : ssarip1/BigJob

    ##########################################################################################
    # Make sure you are familiar with the queue structure on FutureGrid, the ppn, your
    # project id and the walltime limits on each queue. Change accordingly.
    #

    RESOURCEMGR_URL = "pbs-ssh://username@Target_machines_host_name"
    queue = "normal"        # Queue to which the BigJob is submitted. If None, the default queue is used.
    project = None          # Allocation information
    walltime = 30           # Time in minutes. There are limits on the time you can request.
    processes_per_node = 8  # ppn
    number_of_processes = 12  # The total number of processes (BigJob size), used to run SubJobs
    workingdirectory = os.path.join(os.getcwd(), "agent")  # working directory for agent.
    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + RESOURCEMGR_URL)
    bj1 = bigjob(COORDINATION_URL)
    bj1.start_pilot_job(RESOURCEMGR_URL,
                        None,
                        number_of_processes,
                        queue,
                        project,
                        workingdirectory,
                        None,
                        walltime,
                        processes_per_node)

    print("Pilot Job/BigJob URL: " + bj1.pilot_url + " State: " + str(bj1.get_state()))

    ##########################################################################################
    # Start 2nd BigJob
    ##########################################################################################

Exemple #18

0

Afficher le fichier

Fichier : example_local_multiple_reconnect.py Projet : ssarip1/BigJob

            return False


""" Test Job Submission via Advert """
if __name__ == "__main__":
    # Reconnect to an existing BigJob (URL given as the only command line
    # argument), report its state and, unless it is already done, submit
    # further sub-jobs to it.

    starttime = time.time()

    if len(sys.argv) == 2:
        reconnect_url = sys.argv[1]
    else:
        print("Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>")
        sys.exit(-1)

    print("Reconnect to Pilot Job/BigJob at: " + reconnect_url)
    bj = bigjob(COORDINATION_URL, reconnect_url)

    # str() wraps only the state; the sub-job count is appended afterwards so
    # the line also works when get_state() does not return a plain string.
    print("Pilot Job/BigJob URL: " + bj.pilot_url
          + " State: " + str(bj.get_state())
          + " Number of SJs: " + str(len(bj.list_subjobs())))

    ##########################################################################################
    # Submit some more subjobs
    if bj.get_state() != "Done":
        jobs = []
        job_start_times = {}
        job_states = {}
        for i in range(0, NUMBER_JOBS):
            jd = description()
            jd.executable = "/bin/date"
            jd.number_of_processes = "1"
            jd.spmd_variation = "single"

Exemple #19

0

Afficher le fichier

Fichier : example_local_multiple.py Projet : saga-project/BigJob

    The following URLs are accepted:
    
    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.
    
    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "fork://localhost"

    ##########################################################################################

    # Launch the pilot job on the selected resource manager.
    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url, number_of_processes, queue, project,
                       workingdirectory, userproxy, walltime,
                       processes_per_node)

    print("Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()))

    ##########################################################################################
    # Submit SubJob through BigJob
    jobs = []
    job_start_times = {}

Exemple #20

0

Afficher le fichier

        t = t + 10
        temps.append(temp)

##################################################################################
# Start BigJob
# Parameter for BigJob
    bigjob_agent = os.getcwd() + "/bigjob_agent_launcher.sh"  # agent launcher script
    #bigjob_agent = "/bin/echo"
    nodes = BIGJOB_SIZE  # nodes requested per agent
    workingdirectory = os.getcwd() + "/agent"  # agent working directory
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    bjs = []
    i = 0
    for i in range(0, NUMBER_BIGJOBS):
        bj = bigjob.bigjob(advert_host)
        bjs.append(bj)

        # The first three pilots go to dedicated hosts, all further ones to REMOTE3.
        if i == 0:
            gram_host = HOST
        elif i == 1:
            gram_host = REMOTE1
        elif i == 2:
            gram_host = REMOTE2
        else:
            gram_host = REMOTE3
        lrms_url = "gram://" + gram_host + "/jobmanager-pbs"

        # bj is the element just appended at index i.
        bj.start_pilot_job(lrms_url,
                           bigjob_agent,
                           nodes,
                           None,
                           None,
                           workingdirectory,
                           userproxy,
                           150)
        print("Start Pilot Job/BigJob: " + bigjob_agent + " at: " + lrms_url)
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()))
        print "####################" + time.asctime(time.localtime(time.time(

Exemple #21

0

Afficher le fichier

Fichier : example_local_single.py Projet : ashleyz/BigJob

def main():
    """Run one ``/bin/echo`` sub-job through a BigJob started via ``fork://localhost``.

    Starts the pilot job, submits a single sub-job that echoes the
    ``HELLOWORLD`` environment variable, polls the sub-job every two seconds
    until it is ``Failed`` or ``Done``, then cancels the pilot job.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = "normal"  # default queue is used when None
    project = None  # default allocation is used when None
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent")  # working directory for agent
    workingdirectory = "agent"
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    
    """ 
    URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:
    
    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.
    
    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "fork://localhost"  # run the jobs on localhost
    #lrms_url = "sge://localhost" # resource url to run the jobs on localhost
    #lrms_url = "ssh://localhost" # resource url to run the jobs on localhost

    ##########################################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot_job = bigjob(COORDINATION_URL)
    pilot_job.start_pilot_job(lrms_url, number_of_processes, queue, project,
                              workingdirectory, userproxy, walltime,
                              processes_per_node)

    print("Pilot Job/BigJob URL: " + pilot_job.pilot_url
          + " State: " + str(pilot_job.get_state()))

    ##########################################################################################
    # Describe and submit the sub-job
    job_description = description()
    job_description.executable = "/bin/echo"
    #job_description.executable = "$HOME/hello.sh"
    job_description.number_of_processes = "1"
    job_description.arguments = ["$HELLOWORLD"]
    job_description.environment = ['HELLOWORLD=hello_world']
    #job_description.spmd_variation = "mpi"

    # specify an optional working directory if the sub-job should be executed
    # outside of the bigjob sandbox
    #job_description.working_directory = "/tmp"
    job_description.output = "stdout.txt"
    job_description.error = "stderr.txt"
    sub_job = subjob()
    sub_job.submit_job(pilot_job.pilot_url, job_description)

    #########################################
    # Poll until the sub-job reaches a final state
    while True:
        state = str(sub_job.get_state())
        print("state: " + state)
        if state in ("Failed", "Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    pilot_job.cancel()

Exemple #22

0

Afficher le fichier

def main():
    """Repeatedly start a BigJob over ``ssh://boskop``, run one sub-job and cancel it.

    The start / submit / wait / cancel cycle is executed 99999 times; each
    cycle polls its sub-job every two seconds until it is ``Failed`` or
    ``Done``.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = "normal"  # default queue is used when None
    project = None  # default allocation is used when None
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent")  # working directory for agent
    workingdirectory = "agent"
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)
    """ 
    URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:

    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.

    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "ssh://boskop"
    #lrms_url = "sge://localhost"
    #lrms_url = "fork://localhost"

    ##########################################################################################

    # for i in range(99999):
    #     js = saga.job.Service (lrms_url)
    #     j  = js.run_job ("/bin/sleep 1000")
    #     print "%4d: %s" % (i, j.state)

    for iteration in range(99999):
        print(iteration)

        print("Start Pilot Job/BigJob at: " + lrms_url)
        pilot_job = bigjob(COORDINATION_URL)
        pilot_job.start_pilot_job(lrms_url,
                                  number_of_processes,
                                  queue,
                                  project,
                                  workingdirectory,
                                  userproxy,
                                  walltime,
                                  processes_per_node)

        print("Pilot Job/BigJob URL: " + pilot_job.pilot_url
              + " State: " + str(pilot_job.get_state()))

        ##########################################################################################
        # Describe and submit the sub-job
        job_description = description()
        job_description.executable = "/bin/echo"
        #job_description.executable = "$HOME/hello.sh"
        job_description.number_of_processes = "1"
        job_description.arguments = ["$HELLOWORLD"]
        job_description.environment = ['HELLOWORLD=hello_world']
        #job_description.spmd_variation = "mpi"

        # specify an optional working directory if the sub-job should be
        # executed outside of the bigjob sandbox
        #job_description.working_directory = "/tmp"
        job_description.output = "stdout.txt"
        job_description.error = "stderr.txt"

        sub_job = subjob()
        sub_job.submit_job(pilot_job.pilot_url, job_description)

        #########################################
        # Poll until the sub-job reaches a final state
        while True:
            state = str(sub_job.get_state())
            print("state: " + state)
            if state in ("Failed", "Done"):
                break
            time.sleep(2)

        ##########################################################################################
        # Cleanup - stop BigJob
        pilot_job.cancel()

Exemple #23

0

Afficher le fichier

Fichier : example_condor_single.py Projet : saga-project/BigJob

def main():
    """Run a ``/bin/cat test.txt`` sub-job through a BigJob started via ``condor://localhost``.

    The file ``../test.txt`` (relative to this script) is passed to the pilot
    job as a file transfer. The sub-job is polled every two seconds until it
    is ``Failed`` or ``Done``; afterwards the pilot job is cancelled.
    """
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None  # default queue is used when None
    project = None  # default allocation is used when None
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)
    """ 
    URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:
    
    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.
    
    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "condor://localhost"

    ##########################################################################################

    # Stage the input file that lives one directory above this script.
    script_directory = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_directory, "..", "test.txt")
    bj_filetransfers = [input_file + " > test.txt"]

    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot_job = bigjob(COORDINATION_URL)
    pilot_job.start_pilot_job(lrms_url,
                              None,
                              number_of_processes,
                              queue,
                              project,
                              workingdirectory,
                              userproxy,
                              walltime,
                              processes_per_node,
                              bj_filetransfers)

    print("Pilot Job/BigJob URL: " + pilot_job.pilot_url
          + " State: " + str(pilot_job.get_state()))

    ##########################################################################################
    # Describe and submit the sub-job
    job_description = description()

    job_description.executable = "/bin/cat"
    job_description.number_of_processes = "1"
    job_description.spmd_variation = "single"
    job_description.arguments = ["test.txt"]
    #job_description.working_directory = ""
    job_description.output = "sj-stdout.txt"
    job_description.error = "sj-stderr.txt"

    sub_job = subjob()
    sub_job.submit_job(pilot_job.pilot_url, job_description)

    #########################################
    # Poll until the sub-job reaches a final state
    while True:
        state = str(sub_job.get_state())
        bj_state = pilot_job.get_state()
        print("bj state: " + str(bj_state) + " state: " + state)
        if state in ("Failed", "Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    pilot_job.cancel()

Exemple #24

0

Afficher le fichier

Fichier : example_local_multiple_reconnect.py Projet : ashleyz/BigJob

            return False


""" Test Job Submission via Advert """
if __name__ == "__main__":
    # Reconnect to an existing BigJob (URL given as the only command line
    # argument), report its state and, unless it is already done, submit
    # further sub-jobs to it.

    starttime = time.time()

    if len(sys.argv) == 2:
        reconnect_url = sys.argv[1]
    else:
        print("Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>")
        sys.exit(-1)

    print("Reconnect to Pilot Job/BigJob at: " + reconnect_url)
    bj = bigjob(pilot_url=reconnect_url)

    # str() wraps only the state; the sub-job count is appended afterwards so
    # the line also works when get_state() does not return a plain string.
    print("Pilot Job/BigJob URL: " + bj.pilot_url
          + " State: " + str(bj.get_state())
          + " Number of SJs: " + str(len(bj.list_subjobs())))

    ##########################################################################################
    # Submit some more subjobs
    if bj.get_state() != "Done":
        jobs = []
        job_start_times = {}
        job_states = {}
        for i in range(0, NUMBER_JOBS):
            jd = description()
            jd.executable = "/bin/date"
            jd.number_of_processes = "1"
            jd.spmd_variation = "single"

Exemple #25

0

Afficher le fichier


    # ###################################################################################
    # TG/LONI Pilot Job
    # Parameters for the BigJob
    re_agent = "/home/luckow/src/bigjob/bigjob_agent_launcher.sh"  # agent launcher script
    nodes = 8  # nodes requested for the agent
    lrms_url = "gram://poseidon1.loni.org/jobmanager-pbs"  # resource url
    #lrms_url = "gram://qb1.loni.org/jobmanager-fork" # resource url
    project = ""  # allocation
    queue = "checkpt"  # queue (PBS)
    workingdirectory = "/home/luckow/"  # working directory
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # The pilot object is always created; it is only started when grid jobs are requested.
    bj_tg = bigjob.bigjob(advert_host)
    if NUMBER_JOBS_GRID > 0:
        print("Start Pilot Job/BigJob in TG at: " + lrms_url)
        bj_tg.start_pilot_job(lrms_url, re_agent, nodes, queue, project,
                              workingdirectory, userproxy, "20")

    ##########################################################################################
    # Submit SubJob through BigJob
    # NAMD command: $NAMD_HOME/charmrun ++verbose ++remote-shell ssh ++nodelist nodefile +p4 /usr/local/namd2/namd2 NPT.conf
    # working directory: $HOME/run

Exemple #26

0

Afficher le fichier

def _build_bfast_description(resource, number_of_processes, environment=None):
    """Return a SAGA job description running ``bfast match`` under ``time``.

    resource            -- per-resource settings; reads the keys "bfast_exe",
                           "bfast_ref_genome", "bfast_reads" and "bfast_tmp_dir"
    number_of_processes -- value stored in ``jd.number_of_processes``
    environment         -- optional list of "NAME=value" strings; the
                           environment attribute is left untouched when None
    """
    jd = saga.job.description()
    #jd.executable = "/bin/date"
    jd.executable = "time"
    jd.number_of_processes = number_of_processes
    jd.spmd_variation = "single"
    jd.arguments = [resource["bfast_exe"], "match",
                    "-f", resource["bfast_ref_genome"],
                    "-A 1",
                    "-r", resource["bfast_reads"],
                    "-n", "1",
                    "-T", resource["bfast_tmp_dir"]]
    if environment is not None:
        jd.environment = environment
    jd.output = "bfast-stdout.txt"
    jd.error = "bfast-stderr.txt"
    return jd


def load_test(bfast_jobs, run_id=0):
    """Start one BigJob per resource, run the BFAST sub-jobs and report timings.

    bfast_jobs -- mapping of resource name to a settings dict. Each dict must
                  provide "number_nodes", "number_cores_per_node",
                  "number_subjobs", "lrms_url", "working_directory" and the
                  "bfast_*" keys used for the job description. The resource
                  named "kraken" additionally needs "number_aprun_subjobs";
                  other resources may provide "bfast_library_path".
                  The dicts are updated in place with the keys "bigjob",
                  "queueing_time", "subjobs" and "completion_time".
    run_id     -- number identifying this run in the output

    Returns the results as CSV text, one line per BigJob.
    """
    starttime = time.time()
    banner = "\n**************************************************************************************************************************************************\n"
    print(banner)
    print("START TEST %d Specification: %s" % (run_id, str(bfast_jobs)))
    print(banner)

    ##########################################################################################
    # Start one pilot job (bigjob_agent) per resource
    for resource in bfast_jobs.values():
        number_nodes = resource["number_nodes"]
        number_cores_per_node = resource["number_cores_per_node"]
        number_jobs = resource["number_subjobs"]
        lrms_url = resource["lrms_url"]
        workingdirectory = resource["working_directory"]
        print("BJ: %s- #nodes:%d, #cores/node:%d, #jobs: %d, coordination-url:%s, lrms-url:%s" %
              (lrms_url, number_nodes, number_cores_per_node, number_jobs, COORDINATION_URL, lrms_url))

        print("Start Pilot Job/BigJob at: " + lrms_url)
        bj = bigjob(COORDINATION_URL)
        bj.start_pilot_job(lrms_url=lrms_url,
                           number_nodes=number_nodes,
                           processes_per_node=number_cores_per_node,
                           working_directory=workingdirectory)
        resource["bigjob"] = bj
        resource["queueing_time"] = None
        resource["subjobs"] = []

    # Wait until get_bj_states() reports that all BigJobs are running.
    while not get_bj_states(bfast_jobs, starttime):
        time.sleep(1)

    ##########################################################################################
    # Submit SubJobs through the BigJobs
    job_start_times = {}
    job_states = {}
    jobs = []
    for name, resource in bfast_jobs.items():
        bj = resource["bigjob"]
        subjobs = resource["subjobs"]
        if name == "kraken":
            # A single two-process sub-job; the aprun sub-job count is passed
            # on through the environment.
            number_to_submit = 1
            number_of_processes = "2"
            environment = ["NUMBER_SUBJOBS=" + str(resource["number_aprun_subjobs"])]
        else:
            number_to_submit = resource["number_subjobs"]
            number_of_processes = "1"
            environment = None
            if "bfast_library_path" in resource:
                environment = ["LD_LIBRARY_PATH=" + resource["bfast_library_path"]]

        for _ in range(0, number_to_submit):
            jd = _build_bfast_description(resource, number_of_processes, environment)
            sj = subjob()
            sj.submit_job(bj.pilot_url, jd)

            subjobs.append(sj)
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()

        get_bj_states(bfast_jobs, starttime)

    ##########################################################################################
    # Busy wait for completion. Progress is measured against the sub-jobs that
    # were actually submitted, and the finished count is accumulated over all
    # BigJobs (a per-BigJob counter can never reach the overall total as soon
    # as there is more than one BigJob).
    total_number_subjobs = len(jobs)
    while True:
        get_bj_states(bfast_jobs, starttime)
        total_finished = 0
        for resource in bfast_jobs.values():
            bj = resource["bigjob"]
            subjobs = resource["subjobs"]
            number_subjobs_in_bigjob = len(subjobs)
            finish_counter = 0
            result_map = {"Failed": 0, "Done": 0}
            for sj in subjobs:
                old_state = job_states[sj]
                state = sj.get_state()
                result_map[state] = result_map.get(state, 0) + 1
                finished = has_finished(state)
                if old_state != state:
                    print("Job " + str(sj) + " changed from: " + old_state + " to " + state)
                    if finished:
                        job_runtime = time.time() - job_start_times[sj]
                        print("Job: " + str(sj) + " Runtime: " + str(job_runtime) + " s.")
                if finished:
                    finish_counter = finish_counter + 1

                job_states[sj] = state

            total_finished = total_finished + finish_counter

            print("BJ: " + str(bj) + " Result: " + str(result_map))
            # Record the completion time once, the first time every sub-job of
            # this BigJob has finished. This uses the same predicate as the
            # loop exit below, so "completion_time" is always set before the
            # results are assembled.
            if finish_counter == number_subjobs_in_bigjob and "completion_time" not in resource:
                bj_completion_time = time.time() - starttime
                resource["completion_time"] = bj_completion_time
                print("BJ: " + str(bj) + " Result: " + str(result_map) + " Time: " + str(bj_completion_time))

            print("BJ: %s State: %s; %d/%d jobs finished" %
                  (bj, bj.get_state_detail(), finish_counter, number_subjobs_in_bigjob))

        print("%d/%d finished" % (total_finished, total_number_subjobs))
        if total_finished == total_number_subjobs:
            break

        time.sleep(2)

    runtime = time.time() - starttime

    ##########################################################################################
    # Assemble the results: CSV for the caller, tab separated for the console
    results = ""
    for resource in bfast_jobs.values():
        bj = resource["bigjob"]
        number_nodes = resource["number_nodes"]
        number_cores_per_node = resource["number_cores_per_node"]
        number_jobs = resource["number_subjobs"]
        bj_runtime = resource["completion_time"]
        queueing_time = resource["queueing_time"]
        lrms_url = resource["lrms_url"]
        print("Run\tBJ\t#Nodes\t#cores/node\t#jobs\tQueuing Time\tBJ Runtime\tTotal Runtime\tCoordination URL\tLRMS URL")
        result_tuple = (run_id, str(bj), number_nodes, number_cores_per_node, number_jobs,
                        str(queueing_time), str(bj_runtime), str(runtime), COORDINATION_URL, lrms_url)
        results = results + ("%d,%s,%d,%d,%d,%s,%s,%s,%s,%s\n" % result_tuple)
        result_tab = ("%d\t%s\t%d\t%d\t%d\t%s\t%s\t%s,\t%s\t%s" % result_tuple)
        print(result_tab)

    # Cleanup - stop BigJobs
    for resource in bfast_jobs.values():
        resource["bigjob"].cancel()

    # hack: delete manually pbs jobs of user
    #os.system("qstat -u `whoami` | grep -o ^[0-9]* |xargs qdel")
    return results