Esempio n. 1
0
def main():
    """Run the PBE (OT) and G0W0 (diagonalization) CP2K jobs for one molecule.

    Command-line arguments:
        -rank: array job number. It is left-padded with zeros to six
            characters and selects ``dsgdb9nsd/dsgdb9nsd_<rank>.xyz``.
        -num_cpus: number of cpus requested for the array job; one fewer is
            handed to cp2k.

    Steps, as performed below:
        1. (Re)create the scratch folder of the molecule and write the xyz
           file into it.
        2. Build a CP2K input template (PBE, non-periodic cell).
        3. For every cardinal number in ``suffixes``: write and run an OT
           input, then a diagonalization + G0W0 input, and extract the number
           of orbitals, homo/lumo and GW energies from the second output.
        4. Dump the collected record to ``db/DB_<rank>.yaml`` and copy or
           remove the scratch folder depending on the extraction status.

    When ``dummy_run`` is true only the input files are written: cp2k is not
    invoked, nothing is extracted and no DB record is saved.
    """
    #  main settings
    # todo: has to be inherited from the sh file:
    sim = 'sim'  # simulation folder name
    database_folder = 'db'
    # todo: this must be inherited
    bh5670 = 'bh5670'  # the outermost folder in the scratch where all other data are put
    # todo yaml or predict
    my_offset = 15  # from the molecule to a vacuum. 15-20 is recommended!
    cutoff = 500
    rel_cutoff = 50
    basis_set_file_name = 'BASIS_CC_AUG_RI_NEW'  # RI5, 2-5 cc, all aug-cc, RIFIT-all
    my_basis_sets = [
        'aug-cc-pVDZ', 'aug-cc-pVTZ', 'aug-cc-pVQZ', 'aug-cc-pV5Z'
    ]
    my_ri_basis_sets = [
        'aug-cc-pVDZ-RIFIT', 'aug-cc-pVTZ-RIFIT', 'aug-cc-pVQZ-RIFIT',
        'aug-cc-pV5Z-RIFIT'
    ]

    debug = True
    #  end: main settings

    dummy_run = False  # does not invoke cp2k if true

    #  parser begin
    # Both options are mandatory: without -rank the padded rank would silently
    # become '00None', and without -num_cpus int() would fail on None.
    parser = argparse.ArgumentParser(description='rank and num of cpus')
    parser.add_argument('-rank', required=True)  # array job number
    parser.add_argument(
        '-num_cpus',
        required=True)  # number of cpus you request for every array job
    args = parser.parse_args()
    #  parser end

    #  parsing input
    threads = int(args.num_cpus) - 1  # cpus used to compute
    rank = '{:0>6}'.format(
        args.rank)  # transform rank from '1' to '000001' format
    prefix_xyz_file_name = 'dsgdb9nsd'
    xyz_file_name = f'{prefix_xyz_file_name}_{rank}.xyz'
    xyz_file_location = f'{prefix_xyz_file_name}/{xyz_file_name}'
    sim_folder_scratch = f'/scratch/{bh5670}/{sim}/{rank}'
    sim_folder_home = f'{sim}/{rank}'  # sim folder at home exists. you create later {rank} folder
    if os.path.exists(sim_folder_scratch):
        # leftovers from previous simulations will be removed
        rmtree(sim_folder_scratch)
    os.mkdir(sim_folder_scratch)  # a fresh, empty folder in both cases

    # xyz object created, normal xyz file is created at scratch
    my_xyz_file_obj = XYZ.from_file(
        xyz_file_location)  # object created using the file from home
    xyz_at_scratch = sim_folder_scratch + '/' + xyz_file_name
    my_xyz_file_obj.write(xyz_at_scratch)  # writes a normal xyz (into scratch)

    # my molecule object is created. It will serve as a DB record
    my_new_mol = Cp2kOutput(rank)

    # rel_cutoff: 40; cutoff: 300; abc = 10
    # The slice drops the first and the last two characters of the string form
    # of the box size. NOTE(review): presumably this strips brackets of an
    # array/list repr -- confirm against compute_box_size.
    my_abc = str(my_xyz_file_obj.compute_box_size(offset=my_offset))[1:-2]

    # GLOBAL SETTINGS
    #

    ## base settings ##
    my_potential_file_name = 'POTENTIAL'
    my_potential = 'ALL'
    my_project_name = "this_is_template"
    # my_ri_aux_basis_set = 'RI-5Z'  # often fails
    organic_elements = [
        'H', 'C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'B', 'I'
    ]
    my_elements = organic_elements
    inp_file_name = 'test_2345.inp'
    my_vdw_parameters_file = 'dftd3.dat'
    activate_vdw = False
    activate_outer_scf = False
    wf_corr_num_proc = 1  # 16 in the ref paper; -1 to use all

    ########################################### CREATE TEMPLATE FOR TWO RUNS ###########################################
    calc = CP2K()
    calc.working_directory = './'
    calc.project_name = 'artem_gw_project'
    calc.mpi_n_processes = 1

    # pycp2k objects
    CP2K_INPUT = calc.CP2K_INPUT
    FORCE_EVAL = CP2K_INPUT.FORCE_EVAL_add()
    FORCE_EVAL.Method = 'QUICKSTEP'
    SUBSYS = FORCE_EVAL.SUBSYS
    DFT = FORCE_EVAL.DFT
    XC = DFT.XC
    SCF = DFT.SCF
    OUTER_SCF = DFT.SCF.OUTER_SCF
    ####################################################################################################################

    # GLOBAL #
    # FORCE EVAL #
    set_global(CP2K_INPUT, project_name=my_project_name)

    ## SUBSYS ##
    set_unperiodic_cell(SUBSYS, abc=my_abc)
    set_nonperiodic_poisson(DFT)
    set_topology(SUBSYS, xyz_file_name=xyz_file_name)
    center_coordinates(SUBSYS)
    ## END SUBSYS ##

    ## DFT ##
    set_dft(DFT,
            potential_file_name=my_potential_file_name,
            basis_set_file_name=basis_set_file_name)
    set_cutoff(DFT, cutoff=cutoff, rel_cutoff=rel_cutoff, ngrids=5)
    set_scf(DFT, eps_scf=1.0E-9, max_scf=500, scf_guess='ATOMIC')
    add_ot(SCF, stepsize=0.05)
    #
    # add_outer_scf(OUTER_SCF)
    set_pbe(XC)  # we start with pbe
    # set_pbe0(XC) no pbe0 in the beginning
    set_qs(DFT, eps_default=1.0E-10, eps_pgf_orb=np.sqrt(1.0E-10))

    # print_mo(DFT.PRINT)
    if activate_vdw:
        add_vdw(XC, vdw_parameters_file=my_vdw_parameters_file)
    ## END DFT ##

    ######################################## END: CREATE TEMPLATE ##########################################################

    ######################################## BEGIN: RUN CP2K TWO TIMES #####################################################
    suffixes = ['2', '3', '4']  # cardinal numbers of the database

    # begin: input_from_yaml
    # cp2k_exe_path = '/home/artem/soft/cp2k/cp2k-7.1/exe/local/cp2k.popt'
    cp2k_exe_path = '/home/ws/bh5670/cp2k/cp2k-7.1/exe/local/cp2k.popt'
    my_run_type = 'mpi'

    # The loop variable has its own name so that the list of cardinal numbers
    # is not overwritten while it is being iterated.
    for i_bs, suf in enumerate(suffixes):
        # every basis set starts from an untouched copy of the template
        calc_ = deepcopy(calc)

        CP2K_INPUT_ = calc_.CP2K_INPUT
        FORCE_EVAL_ = CP2K_INPUT_.FORCE_EVAL_list[0]
        SUBSYS_ = FORCE_EVAL_.SUBSYS
        DFT_ = FORCE_EVAL_.DFT
        XC_ = DFT_.XC
        SCF_ = DFT_.SCF
        OUTER_SCF_ = DFT_.SCF.OUTER_SCF

        set_global(CP2K_INPUT_, project_name=suf)
        add_elements(SUBSYS_,
                     elements=my_elements,
                     basis=my_basis_sets[i_bs],
                     aux_basis=my_ri_basis_sets[i_bs],
                     pot=my_potential)
        ot_file_name = 'OT_' + f'{suf}_' + inp_file_name
        diag_file_name = 'DIAG_' + f'{suf}_' + inp_file_name
        # end: input

        if i_bs != 0:
            # larger basis sets restart from the wave function of the
            # previous (smaller) basis set
            set_scf(DFT_, eps_scf=1.0E-8, max_scf=500, scf_guess='RESTART')
            try:
                copy(sim_folder_scratch + '/' + f'{int(suf)-1}-RESTART.wfn',
                     sim_folder_scratch + '/' + f'{suf}-RESTART.wfn')
                print('copied restart file 2->3 or 3->4')
            except OSError:
                # best effort: the run goes on without the restart file
                print('not succesfull copy of the restart file')
        else:
            set_scf(DFT_, eps_scf=1.0E-8, max_scf=500,
                    scf_guess='ATOMIC')  # DZ with ATOMIC guess

        # OT run to converge quickly
        calc_.write_input_file(sim_folder_scratch + '/' + ot_file_name)
        # first run
        print(f"Running PBE with OT (basis set = {suf})...")
        if not dummy_run:
            cp2k_run(input_file=ot_file_name,
                     xyz_file=xyz_file_name,
                     run_type=my_run_type,
                     np=threads,
                     output_file=f'out_ot_{suf}.out',
                     cp2k_executable=cp2k_exe_path,
                     execution_directory=sim_folder_scratch)
            # end: first run
        print(f"I have finished cp2k with OT (basis set = {suf})")

        # DIAGONALIZATION RUN to reliably compute HOMO and then GW
        # remove the OT method
        remove_ot(SCF_)

        # change calculations to a diagonalization
        add_diagonalization(SCF_)
        # add_smear(SCF_)  # uses final T.
        add_mixing(SCF_)  # add or not?
        add_mos(SCF_, added_mos=1000)
        # plot homo/lumo

        # set_pbe0(XC_)  # we want G0W0@PBE0. no pbe0 in the beginning
        print_mo_cubes(DFT_.PRINT, nhomo=10,
                       nlumo=10)  # all HOMOs are typically plotted
        set_scf(DFT_, eps_scf=1E-6, max_scf=200)
        # add G0W0!
        add_gw_ver_0(XC_,
                     ev_sc_iter=1,
                     wf_corr_num_proc=wf_corr_num_proc,
                     rpa_num_quad_points=100,
                     max_memory_wf=4000,
                     max_memory_hf=500,
                     corr_occ=1,
                     corr_virt=1)  # GW!
        # it is important to keep WF memory smaller than HF memory, otherwise, it crashes
        calc_.write_input_file(sim_folder_scratch + '/' + diag_file_name)
        # second run
        print(f"Running G0W0 with DIAG (basis set = {suf})...")
        my_out_file2 = f'out_diag_{suf}.out'
        if not dummy_run:
            cp2k_run(input_file=diag_file_name,
                     xyz_file=xyz_file_name,
                     output_file=my_out_file2,
                     run_type=my_run_type,
                     np=threads,
                     cp2k_executable=cp2k_exe_path,
                     execution_directory=sim_folder_scratch)
            print(f"I have finished cp2k with DIAG (basis set = {suf})")

            # extract homo/lumo and gw homo/lumo from the cp2k output file:
            path_to_out2_file = sim_folder_scratch + '/' + my_out_file2

            # Every extraction is best effort: a failure is reported and the
            # value is replaced by the marker string 'not extracted'.
            try:
                num_orb = extract_number_of_independent_orbital_function(
                    path_to_out2_file)
                print(
                    f'basis set = {suf}, number of independent orbital functions: {num_orb}'
                )
            except Exception:
                print('number of orbatals was not extracted')
                num_orb = 'not extracted'

            try:
                homos, lumos = return_homo_lumo(path_to_out2_file)
                # NOTE(review): the helper is called eV_to_Hartree although
                # the product is reported in eV -- confirm the direction of
                # the conversion factor.
                homo = homos[-1] * eV_to_Hartree()
                lumo = lumos[0] * eV_to_Hartree()
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
            except Exception:
                print('Homo/Lumo were not extracted')
                homo = 'not extracted'
                lumo = 'not extracted'

            try:
                gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(
                    path_to_out2_file)
                # fall back to the values of the GW section only if the
                # regular homo/lumo extraction above has failed
                if isinstance(homo, str) and isinstance(lumo, str):
                    homo = homo_
                    lumo = lumo_
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
                print(f'basis set = {suf} ', 'gw homo = ', gw_occ, ' eV')
                print(f'basis set = {suf} ', 'gw lumo = ', gw_vir, ' eV')
            except Exception:
                print("GW energies were not extracted")
                gw_occ = 'not extracted'
                gw_vir = 'not extracted'

            del calc_

            #  put computed data into the molecule object
            my_new_mol.add_energies(int(suf), homo, lumo, gw_occ, gw_vir)
            my_new_mol.add_num_orbitals(int(suf), num_orb)
            my_new_mol.extrapolate_energy()
            # this dict will be written into yaml. it will be a record in the global library
            db_record = my_new_mol.yield_dict()

    print("\nI am done\n")

    # db_record only exists after a real run, so a dummy run saves nothing.
    if not dummy_run:
        print('saving to DB...')

        with open(f'{database_folder}/DB_{rank}.yaml', 'w') as stream:
            yaml.safe_dump(db_record, stream)

        print(f"saved to {database_folder}/DB_{rank}.yaml")

    print('I will remove the content the sim folder')

    # Clean up before leave
    status = my_new_mol.status()
    if status == 'all_extracted':  # all quantities are extracted
        if debug:
            print(
                f'status: {status}, but debug is on ==> will move {sim_folder_scratch} to {sim_folder_home}'
            )
            # dirs_exist_ok lets the copy overwrite a folder left by an
            # earlier run instead of raising FileExistsError
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)
        else:
            print(f'status: {status} ==> will remove {sim_folder_scratch}')
            try_to_remove_folder(sim_folder_scratch)
    else:
        print(f'status: {status} ==> will copy failed sim folder from scratch')
        # rescue the data of the failed run before the scratch copy is removed
        copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)
        print(f"I have copied {sim_folder_scratch} to {sim_folder_home}")
        try_to_remove_folder(sim_folder_scratch)
Esempio n. 2
0
def main():
    """Run the yaml-configured OT and G0W0 CP2K jobs for one molecule.

    Command-line arguments:
        -rank: 1-based position of the molecule id inside the csv file.
        -num_cpus: number of cpus handed to cp2k.
        -i: path to the yaml input file.
        -mol_ids: optional path to a one-line csv file with molecule ids;
            ``db/trash_db_numbers.csv`` is used when it is omitted.

    The function exits early when ``<db>/DB_<rank>.yaml`` already exists.
    Otherwise it (re)creates the scratch folder, and for every entry of
    ``basis_set_suffix`` writes and runs an OT input followed by a
    diagonalization + GW input, extracts orbital count, homo/lumo and GW
    energies, and finally saves the record and copies or removes the scratch
    folder depending on the extraction status. With ``dummy_run`` set in the
    yaml file only the input files are written.
    """
    # SCRATCH is normally in the environment; fall back to a local folder name
    scratch = os.environ.get('SCRATCH', 'scratch')

    #  parser begin
    parser = argparse.ArgumentParser(description='rank and num of cpus')
    parser.add_argument('-rank', required=True)  # array job number
    parser.add_argument(
        '-num_cpus',
        required=True)  # number of cpus you request for every array job
    parser.add_argument('-i', required=True)  # input_from_yaml yaml file
    parser.add_argument(
        '-mol_ids'
    )  # mol_ids to simulate (without prefix and suffix) todo: not used?
    args = parser.parse_args()
    #  parser end

    #  yaml file
    # An explicit Loader is mandatory in recent PyYAML releases; SafeLoader
    # builds plain Python objects only.
    yaml_file_name = args.i
    with open(yaml_file_name) as stream:
        my_input = yaml.load(stream, Loader=yaml.SafeLoader)
    #  end: yaml file

    #  todo: think over because it is imported twice
    #  end: run-or-check settings
    debug = my_input['debug']
    dummy_run = my_input['dummy_run']

    #  if not at cluster: test
    # debug = True
    # dummy_run = True
    # end: if not at cluster

    #  folders names
    sim = my_input['folder_names']['simulations']
    db = my_input['folder_names']['database']
    # the outermost folder in the scratch folder where all other data are put
    bh5670 = my_input['folder_names']['scratch']
    prefix_xyz_file_name = my_input['prefix_xyz_file_name']
    my_offset = my_input['molecule_vacuum_offset']
    mpi = my_input.get('mpi', 'openmpi')

    #  parsing input_from_yaml
    # cpus used to compute. I do not subtract 1. This does not help
    threads = int(args.num_cpus)
    # mol_id = parse_mixed_list()
    path_to_mol_ids_default = 'db/trash_db_numbers.csv'  # simulate mols that did not fully converged
    # path_to_mol_ids_default = 'db/missing_num.csv'  #  simulate mols that are missing in the range of the simulated mols

    # by default: missing_numbers
    path_to_mol_ids = args.mol_ids
    if path_to_mol_ids is None:
        path_to_mol_ids = path_to_mol_ids_default
    with open(path_to_mol_ids, 'r', newline='') as stream:
        # only one line in this csv format file, so we do not loop over
        all_numbers = next(csv.reader(stream))
    rank = all_numbers[int(args.rank) - 1]

    # transform rank from '1' to '000001' format. This is not a general thing
    rank = '{:0>6}'.format(rank)
    xyz_file_name = f'{prefix_xyz_file_name}_{rank}.xyz'
    xyz_file_location = f'{prefix_xyz_file_name}/{xyz_file_name}'

    db_record_path = f'{db}/DB_{rank}.yaml'  # file where the results will be saved todo: clean all of this up!
    #  check if the output exists
    if os.path.exists(db_record_path):
        print(
            f'The simulation results of mol. {rank} is already in the folder of reference'
        )
        raise SystemExit(0)
    #  end: check if the output exists

    # a dummy run works in a folder relative to the current directory
    if not dummy_run:
        sim_folder_scratch = f'/{scratch}/{bh5670}/{sim}/{rank}'
    else:
        sim_folder_scratch = f'{scratch}/{bh5670}/{sim}/{rank}'

    sim_folder_home = f'{sim}/{rank}'  # sim folder at home exists. you create later {rank} folder
    if os.path.exists(sim_folder_scratch):
        # leftovers from previous simulations will be removed
        rmtree(sim_folder_scratch)
    os.mkdir(sim_folder_scratch)  # a fresh, empty folder in both cases

    #  xyz object created, normal xyz file is created at scratch
    try:
        my_xyz_file_obj = XYZ.from_file(
            xyz_file_location)  # object created using the file from home
    except Exception:  # test
        # fall back to a test molecule when the requested file cannot be read
        my_xyz_file_obj = XYZ.from_file('H2O.xyz')

    xyz_at_scratch = sim_folder_scratch + '/' + xyz_file_name
    my_xyz_file_obj.write(xyz_at_scratch)  # writes a normal xyz (into scratch)

    # my molecule object is created. It will serve as a DB record
    my_new_mol = Cp2kOutput(rank)

    # rel_cutoff: 40; cutoff: 300; abc = 10
    # The slice drops the first and the last two characters of the string form
    # of the box size. NOTE(review): presumably this strips brackets of an
    # array/list repr -- confirm against compute_box_size.
    my_abc = str(my_xyz_file_obj.compute_box_size(offset=my_offset))[1:-2]
    my_input['my_abc'] = my_abc
    my_input['xyz_file_name'] = xyz_file_name

    ######################################## BEGIN: RUN CP2K TWO TIMES #####################################################
    # suffix = ['2', '3', '4']  # cardinal numbers of the database

    # begin: input_from_yaml
    # cp2k_exe_path = '/home/artem/soft/cp2k/cp2k-7.1/exe/local/cp2k.popt'
    # cp2k_exe_path = '/home/ws/bh5670/cp2k/cp2k-7.1/exe/local/cp2k.popt'
    cp2k_exe_path = my_input['cp2k_exe_path']
    my_run_type = 'mpi'

    # todo: fix DZ --> 2, TZ --> 3, QZ --> 4
    suffix = my_input['basis_set_suffix']

    #  this will initialize class variables (that is the class) according to the input
    #  actually, this is probably a bad idea to make it like that, because if one forgets doing so,
    #  class functions will not work
    InputFactory.set_constants(input_from_yaml=my_input)

    # --> my_cp2k_run: condensed function with just 2 argument.
    # my_inp_file, my_out_file: return names
    # reason: its other parameters are the same for all 6 (or more runs)
    # this is nothing more than a shorthand, this is why it is ugly
    def my_inp_file(suf, ot_or_diag):
        return f'{ot_or_diag}_{suf}.inp'

    def my_out_file(suf, ot_or_diag):
        return f'out_{ot_or_diag}_{suf}.out'

    def my_cp2k_run(suf='2', ot_or_diag='ot'):
        cp2k_run(input_file=my_inp_file(suf, ot_or_diag),
                 output_file=my_out_file(suf, ot_or_diag),
                 xyz_file=xyz_file_name,
                 run_type=my_run_type,
                 np=threads,
                 cp2k_executable=cp2k_exe_path,
                 execution_directory=sim_folder_scratch,
                 type_mpi=mpi)

    # <-- my_cp2k_run

    print('I am HERE')

    for i_bs, suf in enumerate(suffix):

        # --> OT dft. (OT = orbital transformation)
        dft_ot_simulation = InputFactory.new_dft_ot(i_bs)
        dft_ot_simulation.write_input_file(
            f"{sim_folder_scratch}/{my_inp_file(suf=suf, ot_or_diag='ot')}")
        # OT dft run below ...
        # ... but before, we copy the RESTART from the previous basis set (it exists unless for the smallest basis set)
        try_to_copy_previous_restart_file(
            i_bs=i_bs, sim_folder_scratch=sim_folder_scratch, suf=suf)
        print(f"Running PBE with OT (basis set = {suf})...")
        if not dummy_run:
            my_cp2k_run(suf=suf, ot_or_diag='ot')
        print(f"I have finished cp2k with OT (basis set = {suf})")
        # <-- OT dft

        # --> GW following DIAG dft. (DIAG = diagonalization)
        diag_out_file = f"{sim_folder_scratch}/{my_out_file(suf=suf, ot_or_diag='diag')}"
        diag_inp_file = f"{sim_folder_scratch}/{my_inp_file(suf=suf, ot_or_diag='diag')}"
        gw_diag_simulations = InputFactory.new_gw(i_bs)
        gw_diag_simulations.write_input_file(diag_inp_file)
        print(f"Running G0W0 with DIAG (basis set = {suf})...")
        if not dummy_run:
            my_cp2k_run(suf=suf, ot_or_diag='diag')
            print(f"I have finished cp2k with DIAG (basis set = {suf})")
            # --> extract (from diag out)
            # Every extraction is best effort: a failure is reported and the
            # value is replaced by the marker string 'not extracted'.
            # extract number of orbitals:
            try:
                num_orb = extract_number_of_independent_orbital_function(
                    diag_out_file)
                print(
                    f'basis set = {suf}, number of independent orbital functions: {num_orb}'
                )
            except Exception:
                print('number of orbitals was not extracted')
                num_orb = 'not extracted'
            # extract energies:
            try:
                homos, lumos = return_homo_lumo(diag_out_file)
                # NOTE(review): the helper is called eV_to_Hartree although
                # the product is reported in eV -- confirm the direction of
                # the conversion factor.
                homo = homos[-1] * eV_to_Hartree()
                lumo = lumos[0] * eV_to_Hartree()
                # report the current basis set, not the whole suffix list
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
            except Exception:
                print('Homo/Lumo were not extracted')
                homo = 'not extracted'
                lumo = 'not extracted'

            # NOTE(review): an exception other than the two handled below
            # still propagates out of main() once the finally clause is done.
            try:
                gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(
                    diag_out_file)
                homo, lumo = redefine_homo_lumo_if_not_extracted_before(
                    homo_, lumo_, homo, lumo)
                print_extracted_energies(suf, homo, lumo, gw_occ,
                                         gw_vir)  # on a screen
            except SCQPSolutionNotFound:  # we know how to handle this error
                print(
                    "GW is not extracted, because SCQPSolutionNotFound. Calling fallback ..."
                )
                # --> if the solution is not found, it could be that the number of quad points is insufficient
                gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[
                    0].DFT.XC.WF_CORRELATION_list[
                        0].RI_RPA.Rpa_num_quad_points = 500
                print(
                    "I write the fallback input file where num of q points = 500. It has the same name as before?"
                )
                gw_diag_simulations.write_input_file(diag_inp_file)
                my_cp2k_run(suf=suf, ot_or_diag='diag')
            except SCFNotConvergedNotPossibleToRunMP2:
                print(
                    "GW is not extracted, because SCFNotConvergedNotPossibleToRunMP2. Calling fallback ..."
                )
                print('NOT IMPLEMENTED')
            finally:
                # second attempt; it reads the output of the fallback run when
                # one was made, and repeats the extraction otherwise
                try:
                    gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(
                        diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(
                        homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, gw_occ,
                                             gw_vir)  # on a screen
                # <---
                except Exception:
                    print(
                        "GW energies were not extracted even in the fallback")
                    gw_occ = 'not extracted'
                    gw_vir = 'not extracted'

            del dft_ot_simulation, gw_diag_simulations

            #  put computed data into the molecule object
            my_new_mol.add_energies(int(suf), homo, lumo, gw_occ, gw_vir)
            my_new_mol.add_num_orbitals(int(suf), num_orb)
            my_new_mol.extrapolate_energy()  # level up?
            # this dict will be written into yaml. it will be a record in the global library
            db_record = my_new_mol.yield_dict()
        # <-- END: GW run and extraction
    ####################################### END: RUN CP2K TWO TIMES #####################################################
    print("\nI am done\n")

    # db_record only exists after a real run, so a dummy run saves nothing
    if not dummy_run:
        print('saving to DB...')

        with open(f'{db}/DB_{rank}.yaml', 'w') as stream:
            yaml.safe_dump(db_record, stream)

        print(f"saved to {db}/DB_{rank}.yaml")

    print('I will remove the content of the sim folder')
    # Clean up before leave
    status = my_new_mol.status()
    if status == 'all_extracted':  # all quantities are extracted
        if debug:
            print(
                f'status: {status}, but debug is on ==> will move {sim_folder_scratch} to {sim_folder_home}'
            )
            copytree(sim_folder_scratch, sim_folder_home,
                     dirs_exist_ok=True)  # will rewrite the folder
        else:
            print(f'status: {status} ==> will remove {sim_folder_scratch}')
            try_to_remove_folder(sim_folder_scratch)
    else:
        print(f'status: {status} ==> will copy failed sim folder from scratch')
        try:
            # dirs_exist_ok: a home folder left by an earlier attempt must not
            # block the copy, because the scratch data is removed right after
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)
            print(f"I have copied {sim_folder_scratch} to {sim_folder_home}")
        except OSError:
            print(
                f"I could not copy {sim_folder_scratch} to {sim_folder_home}")
        try_to_remove_folder(sim_folder_scratch)
Esempio n. 3
0
def main():
    """Build DB records from cp2k outputs that already exist under ``sim/``.

    Every entry of the ``sim`` folder is treated as one molecule. For each of
    them the diagonalization outputs ``sim/<folder>/out_diag_<suf>.out`` of
    the basis sets listed in ``SUFFIX`` are parsed for the number of orbitals,
    homo/lumo and GW energies, and the collected record is written to
    ``db/DB_<folder>.yaml``. Molecules that already have a record are skipped.
    Values that cannot be extracted are stored as the string 'not extracted'.
    """
    def my_out_file(suf, ot_or_diag):
        return f'out_{ot_or_diag}_{suf}.out'

    SUFFIX = ['2', '3']

    my_path = os.path.abspath('')

    print(my_path)

    all_folders = os.listdir('sim')

    print(f'all folders of interest = {all_folders}')

    path_to_folders = [
        os.path.abspath('sim' + '/' + folder) for folder in all_folders
    ]

    print(path_to_folders)

    for folder in all_folders:

        db_record_path = f'db/DB_{folder}.yaml'

        if os.path.exists(db_record_path):
            print(
                f'The simulation results of mol. {folder} is already in the folder of reference'
            )
            # skip only this molecule; the remaining folders are still processed
            continue

        print(f'\nTHIS IS MOL_NUM {folder}\n')

        my_new_mol = Cp2kOutput(folder)

        for suf in SUFFIX:
            diag_out_file = f"sim/{folder}/{my_out_file(suf=suf, ot_or_diag='diag')}"

            # extract number of orbitals:
            try:
                num_orb = extract_number_of_independent_orbital_function(
                    diag_out_file)
                print(
                    f'basis set = {suf}, number of independent orbital functions: {num_orb}'
                )
            except Exception:
                print('number of orbitals was not extracted')
                num_orb = 'not extracted'
            # extract energies:
            try:
                homos, lumos = return_homo_lumo(diag_out_file)
                # NOTE(review): the helper is called eV_to_Hartree although
                # the product is reported in eV -- confirm the direction of
                # the conversion factor.
                homo = homos[-1] * eV_to_Hartree()
                lumo = lumos[0] * eV_to_Hartree()
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
            except Exception:
                print('Homo/Lumo were not extracted')
                homo = 'not extracted'
                lumo = 'not extracted'

            # NOTE(review): an exception other than the one handled below
            # still propagates out of main() once the finally clause is done.
            try:
                gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(
                    diag_out_file)
                homo, lumo = redefine_homo_lumo_if_not_extracted_before(
                    homo_, lumo_, homo, lumo)
                print_extracted_energies(suf, homo, lumo, gw_occ,
                                         gw_vir)  # on a screen
            except SCFNotConvergedNotPossibleToRunMP2:
                print(
                    "GW is not extracted, because SCFNotConvergedNotPossibleToRunMP2. Calling fallback ..."
                )
                print('NOT IMPLEMENTED')
            finally:
                # the extraction is repeated here; when it fails the GW
                # energies are replaced by the marker string
                try:
                    gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(
                        diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(
                        homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, gw_occ,
                                             gw_vir)  # on a screen
                # <---
                except Exception:
                    print(
                        "GW energies were not extracted even in the fallback")
                    gw_occ = 'not extracted'
                    gw_vir = 'not extracted'

            #  put computed data into the molecule object
            my_new_mol.add_energies(int(suf), homo, lumo, gw_occ, gw_vir)
            my_new_mol.add_num_orbitals(int(suf), num_orb)
        my_new_mol.extrapolate_energy()  # level up?
        # this dict will be written into yaml. it will be a record in the global library
        db_record = my_new_mol.yield_dict()
        # <-- END: GW run and extraction

        print('saving to DB...')

        with open(db_record_path, 'w') as stream:
            yaml.safe_dump(db_record, stream)

        print(f"saved to db/DB_{folder}.yaml")

        print('done')
Esempio n. 4
0
def main():
    """
    Run the cp2k workflow (PBE with OT, then G0W0 with DIAG) for one molecule and store the results.

    Command line arguments:
      -rank      1-based position of the molecule inside the first line of the -mol_ids csv file
      -num_cpus  number of processes passed to cp2k
      -i         yaml input file; it is loaded into the dictionary my_input
      -mol_ids   csv file with the molecule identifiers

    For every entry of my_input['basis_set_suffix'] an OT run and a DIAG/GW run are started in the
    scratch folder and the energies are extracted from the DIAG output. If the extraction raises one
    of the known GW errors, modified input files are written and cp2k is replayed. Unless dummy_run
    is set, the record is dumped to {db}/DB_{mol_identifier}.yaml. At the end the scratch folder is
    copied home and/or removed, depending on my_new_mol.status() and on the debug flag.
    """
    # SCRATCH is normally in the environment; otherwise a relative folder 'scratch' is used
    scratch = os.environ.get('SCRATCH', 'scratch')

    #  parser begin
    parser = argparse.ArgumentParser(description='rank and num of cpus')
    parser.add_argument('-rank')  # array job number
    parser.add_argument('-num_cpus')  # number of cpus you request for every array job
    parser.add_argument('-i')  # input_from_yaml yaml file
    parser.add_argument('-mol_ids')  # path to csv file with some identifiers of molecules to be simulated.
    # Identifiers may be: numbers of the molecules (6 digits) or the full names of the molecules to simulate
    args = parser.parse_args()
    #  parser end

    #  yaml file. my_input is the dictionary containing input information.
    yaml_file_name = args.i
    with open(yaml_file_name) as stream:
        my_input = yaml.load(stream=stream, Loader=yaml.SafeLoader)
    #  end: yaml file

    #  run-or-check settings
    debug = my_input['debug']  # if True, the scratch folder is copied back to the sim folder even on success
    dummy_run = my_input['dummy_run']  # if True, cp2k is not invoked and nothing is saved to the db

    #  folders names
    sim = my_input['folder_names']['simulations']
    db = my_input['folder_names']['database']
    bh5670 = my_input['folder_names']['scratch']  # the outermost folder in the scratch folder where all other data are put
    my_offset = my_input['molecule_vacuum_offset']
    mpi = my_input.get('mpi', 'openmpi')  # 'openmpi' unless the input says otherwise

    #  parsing input_from_yaml
    threads = int(args.num_cpus)  # cpus used to compute. I do not subtract 1. This does not help

    def determine_file_name_path(my_input, args):
        """
        Depending on the format of the input dictionary (my_input['db_format']),
         returns the name of the input file and the path to it
        @param my_input: input dictionary, which was read in from the yaml input file
        @param args: command line arguments list
        @return: xyz_file_name, xyz_file_path, mol_identifier (self-explained)
        """
        # 'general' means that names are full mols identifiers;
        # 'dsgdb9nsd' (default) is the format of the corresponding dataset only
        db_format = my_input.get('db_format', 'dsgdb9nsd')

        path_to_mol_ids = args.mol_ids
        if path_to_mol_ids is None:
            # argparse stores None when -mol_ids is omitted; without this check open(None) fails below
            print(f'path_to_mol_ids {args.mol_ids} is not found. Exiting...')
            exit()

        with open(path_to_mol_ids, 'r') as stream:
            # only one line in this csv format file, so we do not loop over
            all_mols_ids = next(csv.reader(stream))

        # "rank" is not actually a rank: it is the 1-based index of the identifier in the csv line
        raw_identifier = all_mols_ids[int(args.rank) - 1]

        if db_format == 'dsgdb9nsd':
            prefix_xyz_file_name = my_input['prefix_xyz_file_name']
            mol_identifier_6_digits = '{:0>6}'.format(raw_identifier)  # '1' --> '000001'. This is not a general thing
            xyz_file_name = f'{prefix_xyz_file_name}_{mol_identifier_6_digits}.xyz'  # only file name
            xyz_file_path = f'../{prefix_xyz_file_name}/{xyz_file_name}'  # path to the file. db outside working folder
            return xyz_file_name, xyz_file_path, mol_identifier_6_digits

        elif db_format == 'general':
            try:
                prefix_xyz_file_name = my_input['prefix_xyz_file_name']  # here, it is the name of the db folder.
            except KeyError:
                prefix_xyz_file_name = my_input['dataset_name']
            mol_identifier = raw_identifier.split('.')[0]  # drop everything after the first dot
            xyz_file_name = f'{mol_identifier}.xyz'  # only file name
            xyz_file_path = f'../{prefix_xyz_file_name}/{xyz_file_name}'  # path to the file. db outside working folder
            return xyz_file_name, xyz_file_path, mol_identifier
        else:
            print('Unknown db_format. Exiting...')
            exit()

    xyz_file_name, xyz_file_path, mol_identifier = determine_file_name_path(my_input, args)

    # db -->
    # check if the output exists in the 'db' folder
    db_record_path = f'{db}/DB_{mol_identifier}.yaml'  # file where the results will be saved
    if os.path.exists(db_record_path):
        print(f'The simulation results of mol. {mol_identifier} is already in the folder of reference')
        exit()
    #  here one can check if the DB_ file is not broken
    # <-- db

    # scratch -->
    # the scratch path is the same for productive and dummy runs; only the message differs
    sim_folder_scratch = f'{scratch}/{bh5670}/{sim}/{mol_identifier}'
    if not dummy_run:
        print('This a productive run')
    else:
        print('This is dummy run')
    print(f'I set a sim_folder_scratch to: {sim_folder_scratch}')
    os.makedirs(sim_folder_scratch, exist_ok=True)
    # <-- scratch

    # sim -->
    # the sim folder at home has to exist beforehand; the {mol_identifier} folder is created here
    sim_folder_home = f'{sim}/{mol_identifier}'
    if not os.path.exists(sim_folder_home):  # home
        os.mkdir(sim_folder_home)
    else:
        # leftovers of a previous simulation are moved to scratch and the home folder is emptied
        print(f"I have found the folder {sim_folder_home} in the sim folder and will try to copy it to scratch ....")
        copytree(sim_folder_home, sim_folder_scratch, dirs_exist_ok=True)  # will rewrite the folder
        print('...done!')
        print(f'now I will remove the {sim_folder_home} folder at home and create a new empty folder at its place...')
        rmtree(sim_folder_home)  # leftovers from previous simulations will be removed
        os.mkdir(sim_folder_home)  # and the new folder will be created
        print('...done')
    # <-- sim

    #  xyz object created, normal xyz file is created at scratch
    # todo: H20.xyz is not relevant anymore
    try:
        my_xyz_file_obj = XYZ.from_file(xyz_file_path)  # object created using the file from home
    except FileNotFoundError:  # test
        my_xyz_file_obj = XYZ.from_file('H2O.xyz')  # test fallback

    xyz_at_scratch = sim_folder_scratch + '/' + xyz_file_name
    my_xyz_file_obj.write(xyz_at_scratch)  # writes a normal xyz (into scratch)

    # my molecule object is created. It will serve as a DB record
    my_new_mol = Cp2kOutput(mol_identifier)

    # the string form of the box size without its first and its last two characters
    # NOTE(review): presumably this strips enclosing brackets -- confirm against compute_box_size
    my_abc = str(my_xyz_file_obj.compute_box_size(offset=my_offset))[1:-2]

    # last changes to my_input: offsets and xyz_file_name
    my_input['my_abc'] = my_abc
    my_input['xyz_file_name'] = xyz_file_name
    my_xyz_file_obj.identify_atom_types()
    my_input['elements'] = my_xyz_file_obj.unique_atom_types  # elements identified automatically

    ######################################## BEGIN: RUN CP2K TWO TIMES #####################################################
    cp2k_exe_path = my_input['cp2k_exe_path']
    my_run_type = 'mpi'

    suffix = my_input['basis_set_suffix']  # todo: fix DZ --> 2, TZ --> 3, QZ --> 4

    #  this will initialize class variables (that is the class) according to the input
    #  actually, this is probably a bad idea to make it like that, because if one forgets doing so,
    #  class functions will not work
    InputFactory.set_constants(input_from_yaml=my_input)

    # --> my_cp2k_run: condensed function with just 2 arguments.
    # my_inp_file, my_out_file: return names
    # reason: the other parameters are the same for all runs
    def my_inp_file(suf, ot_or_diag):
        return f'{ot_or_diag}_{suf}.inp'

    def my_out_file(suf, ot_or_diag):
        return f'out_{ot_or_diag}_{suf}.out'

    def my_cp2k_run(suf='2', ot_or_diag='ot'):
        cp2k_run(input_file=my_inp_file(suf, ot_or_diag),
                 output_file=my_out_file(suf, ot_or_diag),
                 xyz_file=xyz_file_name,
                 run_type=my_run_type,
                 np=threads,
                 cp2k_executable=cp2k_exe_path,
                 execution_directory=sim_folder_scratch,
                 type_mpi=mpi)

    # <-- my_cp2k_run

    print('I am HERE')

    for i_bs, suf in enumerate(suffix):

        # --> OT dft. (OT = orbital transformation)
        dft_ot_simulation = InputFactory.new_dft_ot(i_bs)
        ot_inp_file = f"{sim_folder_scratch}/{my_inp_file(suf=suf, ot_or_diag='ot')}"
        dft_ot_simulation.write_input_file(ot_inp_file)
        # OT dft run below ...
        # ... but before, we copy the RESTART from the previous basis set (it exists unless for the smallest basis set)
        try_to_copy_previous_restart_file(i_bs=i_bs, sim_folder_scratch=sim_folder_scratch, suf=suf)
        print(f"Running PBE with OT (basis set = {suf})...")
        if not dummy_run:
            my_cp2k_run(suf=suf, ot_or_diag='ot')
        print(f"I have finished cp2k with OT (basis set = {suf})")
        # <-- OT dft

        # --> GW following DIAG dft. (DIAG = diagonalization)
        diag_out_file = f"{sim_folder_scratch}/{my_out_file(suf=suf, ot_or_diag='diag')}"
        diag_inp_file = f"{sim_folder_scratch}/{my_inp_file(suf=suf, ot_or_diag='diag')}"
        gw_diag_simulations = InputFactory.new_gw(i_bs)
        gw_diag_simulations.write_input_file(diag_inp_file)
        print(f"Running G0W0 with DIAG (basis set = {suf})...")
        if not dummy_run:
            my_cp2k_run(suf=suf, ot_or_diag='diag')
            print(f"I have finished cp2k with DIAG (basis set = {suf})")
            # --> extract (from diag out)
            # extract number of orbitals:
            try:
                num_orb = extract_number_of_independent_orbital_function(diag_out_file)
                print(f'basis set = {suf}, number of independent orbital functions: {num_orb}')
            except Exception:
                print('number of orbitals was not extracted')
                num_orb = 'not extracted'
            # extract energies:
            try:
                homos, lumos = return_homo_lumo(diag_out_file)
                homo = homos[-1] * eV_to_Hartree()  # last entry of the returned homo list
                lumo = lumos[0] * eV_to_Hartree()  # first entry of the returned lumo list
                # report the current basis set (suf), not the whole suffix list
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
            except Exception:
                print(f'Homo/Lumo were not extracted')
                homo = 'not extracted'
                lumo = 'not extracted'
            try:  # first try to return gw energies -->
                occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
            except (IterationLimit, LargeSigc, SCQPSolutionNotFound, NaNInGW):  # 20 iterations
                try:  # xyz + 10
                    print("GW is extracted, but scf is not converged, because of IterationLimit. Calling fallback ...")
                    my_abc_plus_10 = str(my_xyz_file_obj.compute_box_size(offset=my_offset+10.0))[1:-2]  # todo: hard
                    # replay ot with a larger xyz space +10
                    # ot
                    dft_ot_simulation.CP2K_INPUT.FORCE_EVAL_list[0].SUBSYS.CELL.Abc = my_abc_plus_10
                    dft_ot_simulation.write_input_file(ot_inp_file)
                    print("Replay ot with xyz + 10")
                    my_cp2k_run(suf=suf, ot_or_diag='ot')
                    print("... ot succesfull")
                    # diag
                    print("diag Replay ot with xyz + 10 and Femi offset of 5E-2")
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].SUBSYS.CELL.Abc = my_abc_plus_10
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Fermi_level_offset = 4.0E-2
                    gw_diag_simulations.write_input_file(diag_inp_file)
                    my_cp2k_run(suf=suf, ot_or_diag='diag')
                    print("... diag succesfull")
                    # the following section is necessary to catch the error:
                    occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
                except (IterationLimit, LargeSigc, SCQPSolutionNotFound, NaNInGW):
                    try:
                        print("GW is extracted, but scf is not converged AGAIN, because of IterationLimit. Calling fallback ...")
                        # diag 200 Q points
                        gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.Rpa_num_quad_points = 200  # this should help as well
                        # NOTE(review): the same input is written and run twice below -- looks like a
                        # copy-paste duplicate; kept as is because the second run may rely on files of the first
                        gw_diag_simulations.write_input_file(diag_inp_file)
                        my_cp2k_run(suf=suf, ot_or_diag='diag')
                        print("I write the fallback input file with QUAD points = 200")
                        gw_diag_simulations.write_input_file(diag_inp_file)
                        my_cp2k_run(suf=suf, ot_or_diag='diag')
                        print("... diag succesful")
                        # the following section is necessary to catch the error:
                        occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                        homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                        print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
                    except (IterationLimit, LargeSigc, SCQPSolutionNotFound, NaNInGW):
                        # NaNInGW is caught here like on the two outer levels; otherwise it would leave
                        # this handler uncaught and abort the whole run after the finally block
                        print("GW is extracted, but scf is not converged AGAIN AGAIN, because of IterationLimit. Calling fallback ...")
                        print("Replay diag with FERMI offset 10E-2!")
                        gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Fermi_level_offset = 10.0E-2
                        gw_diag_simulations.write_input_file(diag_inp_file)
                        my_cp2k_run(suf=suf, ot_or_diag='diag')
                        print("... diag succesful")
            # NOTE(review): SCQPSolutionNotFound is already listed in the first handler above, and except
            # clauses are tried in order, so this handler looks unreachable -- confirm the intended order
            except SCQPSolutionNotFound:  # we know how to handle this error
                try:
                    print("GW is not extracted, because SCQPSolutionNotFound. Calling fallback ...")
                    # --> if the solution is not found, it could be that the number of quad points is insufficient
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Crossing_search = 'BISECTION'  # this alone does not always work
                    print("I write the fallback input file the crossing search is set to BISECTION")
                    gw_diag_simulations.write_input_file(diag_inp_file)
                    my_cp2k_run(suf=suf, ot_or_diag='diag')
                    # the following section is necessary to catch the error:
                    occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
                except SCQPSolutionNotFound:
                    print("GW is not extracted, because SCQPSolutionNotFound. Calling second fallback ...")
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.Rpa_num_quad_points = 500  # this should help as well
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Crossing_search = 'BISECTION'
                    print("I write the fallback input file with QUAD points = 500")
                    gw_diag_simulations.write_input_file(diag_inp_file)
                    my_cp2k_run(suf=suf, ot_or_diag='diag')
            except SCFNotConvergedNotPossibleToRunMP2:
                print("GW is not extracted, because SCFNotConvergedNotPossibleToRunMP2. Calling fallback ...")
                # replay ot with a larger cutoff then make diag with a larger cutoff
                # ot
                dft_ot_simulation.CP2K_INPUT.FORCE_EVAL_list[0].DFT.MGRID.Cutoff = 1000
                dft_ot_simulation.CP2K_INPUT.FORCE_EVAL_list[0].DFT.MGRID.Rel_cutoff = 100
                dft_ot_simulation.write_input_file(ot_inp_file)
                print("Replay ot with cutoff of 100 rel_cutoff of 100...")
                my_cp2k_run(suf=suf, ot_or_diag='ot')
                print("... ot succesfull")
                # diag
                gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.MGRID.Cutoff = 1000
                gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.MGRID.Rel_cutoff = 100
                gw_diag_simulations.write_input_file(diag_inp_file)
                my_cp2k_run(suf=suf, ot_or_diag='diag')
            # NOTE(review): NaNInGW is already listed in the first handler above, so this handler
            # looks unreachable as well -- confirm the intended order
            except NaNInGW:
                try:
                    print("GW is not extracted, because there is a NaN in the last frame of the SCF loop. Calling fallback")
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Crossing_search = 'BISECTION'
                    print("I wrote the fallback. The crossing search is set to BISECTION")
                    gw_diag_simulations.write_input_file(diag_inp_file)
                    my_cp2k_run(suf=suf, ot_or_diag='diag')
                    occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
                except (NaNInGW, SCQPSolutionNotFound):
                    print("GW is not extracted, because NaNInGW AGAIN. Calling second fallback (BISECTION and num_quad_points = 500) ...")
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.Rpa_num_quad_points = 500  # this should help as well
                    gw_diag_simulations.CP2K_INPUT.FORCE_EVAL_list[0].DFT.XC.WF_CORRELATION_list[0].RI_RPA.RI_G0W0.Crossing_search = 'BISECTION'
                    print("I wrote the fallback input file with QUAD points = 500")
                    gw_diag_simulations.write_input_file(diag_inp_file)
                    my_cp2k_run(suf=suf, ot_or_diag='diag')
            finally:
                # whatever happened above, read the output that is on disk now: the last-level
                # fallbacks only rerun cp2k and leave the extraction to this block
                try:
                    occ, vir, homo_, lumo_, occ_scf, vir_scf, occ_0, vir_0 = return_gw_energies_advanced(diag_out_file)
                    homo, lumo = redefine_homo_lumo_if_not_extracted_before(homo_, lumo_, homo, lumo)
                    print_extracted_energies(suf, homo, lumo, occ, vir)  # on a screen
                # <---
                except Exception:
                    print("GW energies were not extracted even in the fallback")
                    occ, vir, occ_scf, vir_scf, occ_0, vir_0 = ['not extracted']*6
            del dft_ot_simulation, gw_diag_simulations

            #  put computed data into the molecule object
            my_new_mol.add_energies_advanced(int(suf), homo, lumo, occ, vir, occ_0, vir_0, occ_scf, vir_scf)
            my_new_mol.add_num_orbitals(int(suf), num_orb)
            my_new_mol.extrapolate_energy_advanced()  # level up?
            db_record = my_new_mol.yield_dict()  # this dict will be written into yaml. it will be a record in the global library
        # <-- END: GW run and extraction
    ####################################### END: RUN CP2K TWO TIMES #####################################################
    print("\nI am done\n")

    if not dummy_run:
        print('saving to DB...')

        with open(f'{db}/DB_{mol_identifier}.yaml', 'w') as stream:
            yaml.safe_dump(db_record, stream)

        print(f"saved to {db}/DB_{mol_identifier}.yaml")

    print('I will remove the content of the sim folder')
    # Clean up before leave
    status = my_new_mol.status()
    if status == 'all_extracted':  # all quantities are extracted
        if debug:
            print(f'status: {status}, but debug is on ==> will move {sim_folder_scratch} to {sim_folder_home}')
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)  # will rewrite the folder
        else:
            print(f'status: {status} ==> will remove {sim_folder_scratch}')
            try_to_remove_folder(sim_folder_scratch)
    else:
        # something is missing: keep the failed simulation at home for inspection, then free the scratch
        print(f'status: {status} ==> will copy failed sim folder from scratch')
        try:
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)  # will rewrite the folder > 3.8 needed
            print(f"I have copied {sim_folder_scratch} to {sim_folder_home}")
        except Exception:
            print(f"I could not copy {sim_folder_scratch} to {sim_folder_home}")
        try_to_remove_folder(sim_folder_scratch)
def main():
    """
    Run the cp2k workflow (PBE with OT, then G0W0 with DIAG) for the molecule number -rank.

    Command line arguments:
      -rank      number of the molecule; it is padded to 6 digits to build the xyz file name
      -num_cpus  number of processes passed to cp2k
      -i         yaml input file; it is loaded into the dictionary my_input

    For every entry of my_input['basis_set_suffix'] an OT run and a DIAG/GW run are started in a
    fresh scratch folder and the energies are extracted from the DIAG output. Unless dummy_run is
    set, the record is dumped to {db}/DB_{rank}.yaml. At the end the scratch folder is copied home
    and/or removed, depending on my_new_mol.status() and on the debug flag.
    """
    scratch = os.environ['SCRATCH']  # SCRATCH has to be in the env var dict. Normally, it is.
    #  parser begin
    parser = argparse.ArgumentParser(description='rank and num of cpus')
    parser.add_argument('-rank')  # array job number
    parser.add_argument('-num_cpus')  # number of cpus you request for every array job
    parser.add_argument('-i')  # input_from_yaml yaml file
    args = parser.parse_args()
    #  parser end

    #  yaml file
    yaml_file_name = args.i
    with open(yaml_file_name) as stream:
        # an explicit Loader is required by recent PyYAML; SafeLoader builds plain python objects only
        my_input = yaml.load(stream=stream, Loader=yaml.SafeLoader)
    #  end: yaml file

    #  run-or-check settings
    debug = my_input['debug']
    dummy_run = my_input['dummy_run']  # if True, cp2k is not invoked and nothing is saved to the db

    #  folders names
    sim = my_input['folder_names']['simulations']
    db = my_input['folder_names']['database']
    bh5670 = my_input['folder_names']['scratch']  # the outermost folder in the scratch folder where all other data are put
    prefix_xyz_file_name = my_input['prefix_xyz_file_name']
    my_offset = my_input['molecule_vacuum_offset']
    type_mpi = my_input.get('mpi', 'openmpi')  # 'openmpi' unless the input says otherwise

    #  parsing input_from_yaml
    threads = int(args.num_cpus)  # cpus used to compute. I do not subtract 1. This does not help
    rank = '{:0>6}'.format(args.rank)  # transform rank from '1' to '000001' format. This is not a general thing
    xyz_file_name = f'{prefix_xyz_file_name}_{rank}.xyz'
    xyz_file_location = f'{prefix_xyz_file_name}/{xyz_file_name}'

    db_record_path = f'{db}/DB_{rank}.yaml'  # file where the results will be saved todo: clean all this up!
    #  check if the output exists
    if os.path.exists(db_record_path):
        print(f'The simulation results of mol. {rank} is already in the folder of reference')
        exit()
    #  end: check if the output exists

    sim_folder_scratch = f'{scratch}/{bh5670}/{sim}/{rank}'
    sim_folder_home = f'{sim}/{rank}'  # sim folder at home exists. you create later {rank} folder

    # always start from an empty scratch folder
    if os.path.exists(sim_folder_scratch):
        rmtree(sim_folder_scratch)  # leftovers from previous simulations will be removed
    os.makedirs(sim_folder_scratch)  # creates missing parent folders as well

    #  xyz object created, normal xyz file is created at scratch
    try:
        my_xyz_file_obj = XYZ.from_file(xyz_file_location)  # object created using the file from home
    except Exception:  # test
        my_xyz_file_obj = XYZ.from_file('H2O.xyz')  # test fallback

    xyz_at_scratch = sim_folder_scratch + '/' + xyz_file_name
    my_xyz_file_obj.write(xyz_at_scratch)  # writes a normal xyz (into scratch)

    # my molecule object is created. It will serve as a DB record
    my_new_mol = Cp2kOutput(rank)

    # the string form of the box size without its first and its last two characters
    # NOTE(review): presumably this strips enclosing brackets -- confirm against compute_box_size
    my_abc = str(my_xyz_file_obj.compute_box_size(offset=my_offset))[1:-2]
    my_input['my_abc'] = my_abc
    my_input['xyz_file_name'] = xyz_file_name

    # misc
    inp_file_name = 'test_2344.inp'  # base file name

    ######################################## BEGIN: RUN CP2K TWO TIMES #####################################################
    cp2k_exe_path = my_input['cp2k_exe_path']
    my_run_type = 'mpi'

    suffix = my_input['basis_set_suffix']  # todo: fix DZ --> 2, TZ --> 3, QZ --> 4

    InputFactory.set_constants(input_from_yaml=my_input)

    for i_bs, suf in enumerate(suffix):

        # I/O
        ot_file_name = 'OT_' + f'{suf}_' + inp_file_name  # for DFT (OT)
        diag_file_name = 'DIAG_' + f'{suf}_' + inp_file_name  # for GW (DIAG)
        # end: I/O

        # OT dft simulation to converge quickly: create the simulation object and write its input
        dft_ot_simulation = InputFactory.new_dft_ot(i_bs)
        dft_ot_simulation.write_input_file(sim_folder_scratch + '/' + ot_file_name)
        # OT dft run below ...

        # ... but before, we copy the RESTART from the previous basis set (it exists unless for the smallest basis set)
        if i_bs != 0:
            try:
                copy(sim_folder_scratch + '/' + f'{int(suf) - 1}-RESTART.wfn',
                     sim_folder_scratch + '/' + f'{suf}-RESTART.wfn')
                print('copied restart file 2->3 or 3->4')
            except Exception:
                print('not succesfull copy of the restart file')

        print(f"Running PBE with OT (basis set = {suf})...")
        if not dummy_run:
            cp2k_run(input_file=ot_file_name,
                     xyz_file=xyz_file_name,
                     run_type=my_run_type,
                     np=threads,
                     output_file=f'out_ot_{suf}.out',
                     cp2k_executable=cp2k_exe_path,
                     execution_directory=sim_folder_scratch,
                     type_mpi=type_mpi)
            # end: first run
        print(f"I have finished cp2k with OT (basis set = {suf})")

        # DIAGONALIZATION RUN to reliably compute HOMO and then GW

        # gw: create the simulation object and write the input file
        gw_diag_simulations = InputFactory.new_gw(i_bs)
        gw_diag_simulations.write_input_file(sim_folder_scratch + '/' + diag_file_name)
        # gw run
        print(f"Running G0W0 with DIAG (basis set = {suf})...")
        my_out_file2 = f'out_diag_{suf}.out'
        if not dummy_run:
            cp2k_run(input_file=diag_file_name,
                     xyz_file=xyz_file_name,
                     output_file=my_out_file2,
                     run_type=my_run_type,
                     np=threads,
                     cp2k_executable=cp2k_exe_path,
                     execution_directory=sim_folder_scratch,
                     type_mpi=type_mpi)
            print(f"I have finished cp2k with DIAG (basis set = {suf})")

            # extract homo/lumo and gw homo/lumo from the cp2k output file:
            path_to_out2_file = sim_folder_scratch + '/' + my_out_file2

            try:
                num_orb = extract_number_of_independent_orbital_function(path_to_out2_file)
                print(f'basis set = {suf}, number of independent orbital functions: {num_orb}')
            except Exception:
                print('number of orbitals was not extracted')
                num_orb = 'not extracted'

            try:
                homos, lumos = return_homo_lumo(path_to_out2_file)
                homo = homos[-1]*eV_to_Hartree()  # last entry of the returned homo list
                lumo = lumos[0]*eV_to_Hartree()  # first entry of the returned lumo list
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
            except Exception:
                print(f'Homo/Lumo were not extracted')
                homo = 'not extracted'
                lumo = 'not extracted'

            try:
                gw_occ, gw_vir, homo_, lumo_ = return_gw_energies(path_to_out2_file)
                # the placeholders above are strings: take the values found next to the GW energies instead
                if isinstance(homo, str) and isinstance(lumo, str):
                    homo = homo_
                    lumo = lumo_
                print(f'basis set = {suf} ', 'homo = ', homo, ' eV')
                print(f'basis set = {suf} ', 'lumo = ', lumo, ' eV')
                print(f'basis set = {suf} ', 'gw homo = ', gw_occ, ' eV')
                print(f'basis set = {suf} ', 'gw lumo = ', gw_vir, ' eV')
            except Exception:
                print("GW energies were not extracted")
                gw_occ = 'not extracted'
                gw_vir = 'not extracted'

            del dft_ot_simulation, gw_diag_simulations

            #  put computed data into the molecule object
            my_new_mol.add_energies(int(suf), homo, lumo, gw_occ, gw_vir)
            my_new_mol.add_num_orbitals(int(suf), num_orb)
            my_new_mol.extrapolate_energy()  # level up?
            db_record = my_new_mol.yield_dict()  # this dict will be written into yaml. it will be a record in the global library

    ######################################## END: RUN CP2K TWO TIMES #######################################################
    print("\nI am done\n")

    if not dummy_run:
        print('saving to DB...')

        with open(f'{db}/DB_{rank}.yaml', 'w') as stream:
            yaml.safe_dump(db_record, stream)

        print(f"saved to {db}/DB_{rank}.yaml")

    print('I will remove the content of the sim folder')
    # Clean up before leave
    status = my_new_mol.status()
    if status == 'all_extracted':  # all quantities are extracted
        if debug:
            print(f'status: {status}, but debug is on ==> will move {sim_folder_scratch} to {sim_folder_home}')
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)  # will rewrite the folder
        else:
            print(f'status: {status} ==> will remove {sim_folder_scratch}')
            try_to_remove_folder(sim_folder_scratch)
    else:
        # something is missing: keep the failed simulation at home for inspection, then free the scratch
        print(f'status: {status} ==> will copy failed sim folder from scratch')
        try:
            # dirs_exist_ok: without it the copy fails whenever the home folder already exists,
            # and the scratch copy of the failed run is removed right below
            copytree(sim_folder_scratch, sim_folder_home, dirs_exist_ok=True)
            print(f"I have copied {sim_folder_scratch} to {sim_folder_home}")
        except Exception:
            print(f"I could not copy {sim_folder_scratch} to {sim_folder_home}")
        try_to_remove_folder(sim_folder_scratch)