Ejemplo n.º 1
0
    # Append the checkpoint directory flag to every argument combination.
    combinations = [
        comb + [f"--checkpoint_dir={args.checkpoint_dir}"]
        for comb in combinations
    ]

    if args.tasks > 1:
        # ports should be in between 2**10 and 2**16, but we'll start from some random
        # distant offset and use only a part of the space
        # hopefully we'll never have 2**15 jobs on the same node
        combinations = [
            comb + [f"--distributed_port={i % (2**15) + 18363}"]
            for (i, comb) in enumerate(combinations)
        ]

    runner = SlurmWrapper(module.main)
    if not args.array:
        # Build a list rather than a generator: `jobs` is iterated twice below
        # (once to report the job ids, once more when force-requeue is set), and
        # a generator would already be exhausted by the second pass.
        jobs = [executor.submit(runner, comb) for comb in combinations]
    else:
        jobs = executor.map_array(runner, combinations)

    # Report which job id was assigned to which argument combination.
    for job, comb in zip(jobs, combinations):
        print(job.job_id, comb)

    print(f"Total jobs launched for total combinations: {len(combinations)}")

    if args.force_requeue:
        # Wait a minute before touching the jobs again.
        time.sleep(60)
        print("sleep over, sending signal")
        for job in jobs:
            # Print the individual job being handled, not the whole collection.
            print(job)
Ejemplo n.º 2
0
    jobs = []

    # Flags appended to every combination: the optional preemption pair comes
    # first, followed by the checkpoint directory.
    shared_flags = []
    if not args.no_preemption:
        shared_flags.extend(['--preemptable', f'--checkpoint_freq={args.checkpoint_freq}'])
    shared_flags.append(f'--checkpoint_dir={args.checkpoint_dir}')
    combinations = [comb + shared_flags for comb in combinations]

    if args.tasks > 1:
        # Ports must lie between 2**10 and 2**16. Start from an arbitrary distant
        # offset and use only part of that space; this relies on never having
        # 2**15 jobs on the same node.
        port_offset = 18363
        port_span = 2**15
        combinations = [
            comb + [f'--distributed_port={index % port_span + port_offset}']
            for index, comb in enumerate(combinations)
        ]

    preview_only = args.preview or args.dry_run
    if preview_only:
        # Nothing is submitted in this mode; each combination is only printed.
        for comb in combinations:
            runner = SlurmWrapper(module.main)
            print(f'{comb}')
    elif args.array:
        # Array mode: hand all combinations to the executor in a single call.
        # The lambda builds a fresh SlurmWrapper each time it is invoked.
        runner = lambda x: SlurmWrapper(module.main)(x)
        jobs = executor.map_array(runner, combinations)

        for job, comb in zip(jobs, combinations):
            print(job.job_id, comb)
    else:
        # One submission per combination, each with its own wrapper instance.
        for comb in combinations:
            runner = SlurmWrapper(module.main)
            job = executor.submit(runner, comb)
            print(f'job id {job.job_id}, args {comb}')
            jobs.append(job)

    print(f'Total jobs launched: {len(jobs)}, total combinations: {len(combinations)}')