def data_gathering(dt, dm, fname, job):
    """Read every image from *fname* and split it into training and test sets.

    Args:
        dt: Split mode; one of ``'set'``, ``'interval'`` or ``'sort'``.
        dm: Split parameters, interpreted according to *dt*:

            * ``'set'`` - the number of sets, either an int or a sequence
              whose first item is that number.  It is handed to ``Images``
              together with *dt*; the actual partitioning is done there
              (original author's note: nset-1 sets for training, the last
              one for test - confirm against ``Images``).
            * ``'interval'`` - slice bounds.  ``dm[0]:dm[1]`` is the training
              region.  With three or more entries the last two entries bound
              the test region (so ``[a, b, c]`` tests on ``b:c`` and
              ``[a, b, c, d]`` tests on ``c:d``); with only two entries the
              test set is empty.
            * ``'sort'`` - ``dm[0]`` is a divider: images whose position is a
              multiple of it go to training, all others go to test.
        fname: Path given to ``ase.io.read`` (all frames are read).
        job: Not used by this function; kept so existing callers keep working.

    Returns:
        A tuple ``(number_of_images_read, training_images, test_images)``.

    Raises:
        ValueError: If *dt* is not one of the supported modes.
    """
    total_images = ase.io.read(fname, index=':')

    if dt == 'set':
        # Accept either a bare count or a sequence that starts with the count.
        nset = dm if isinstance(dm, int) else dm[0]
        images_sets = Images(total_images, dt, nset)
        training_images = images_sets.get_training_images()
        test_images = images_sets.get_test_images()
    elif dt == 'interval':
        # Plain slicing; the Images class is deliberately not involved here.
        training_images = total_images[dm[0]:dm[1]]
        if len(dm) >= 3:
            # The last two bounds always delimit the test region.
            test_start, test_stop = dm[-2:]
            test_images = total_images[test_start:test_stop]
        else:
            test_images = []
            print("There is no test set region ")
    elif dt == 'sort':
        # Alternate through the file: every divider-th image trains,
        # everything in between is used for testing.
        divider = dm[0]
        training_images = []
        test_images = []
        for position, image in enumerate(total_images):
            if position % divider == 0:
                training_images.append(image)
            else:
                test_images.append(image)
    else:
        raise ValueError(
            "unknown data type {!r}: expected 'set', 'interval' or 'sort'"
            .format(dt))

    return len(total_images), training_images, test_images
def data_selection(total_images, dt, dl, job):
    """Split *total_images* into training and test subsets.

    Args:
        total_images: Sequence of images to split.
        dt: Selection mode; ``'npart'``, ``'int'``, ``'div'`` or ``'pick'``.
        dl: Selection parameters, interpreted according to *dt*:

            * ``'npart'`` - number of sets (an int, or a sequence whose first
              item is that number); the split itself is delegated to
              ``Images``.
            * ``'int'`` - slice bounds.  ``dl[0]:dl[1]`` is the training
              region; with three or more entries the last two entries bound
              the test region, otherwise the test set is empty.
            * ``'div'`` - ``[divider, train_remainder]`` or
              ``[divider, train_remainder, test_remainder]``.  An image at
              position ``i`` is used for training when
              ``i % divider == train_remainder`` and, if a test remainder is
              given, for testing when ``i % divider == test_remainder``.
            * ``'pick'`` - ``[n_train, n_test]``: walk through the images
              taking ``n_train`` for training, then ``n_test`` for testing,
              and repeat.  Any other length yields two empty lists.
        job: ``'te'`` means "test only": for ``'int'`` and ``'div'`` the
            selected training region is returned as the test set and the
            training set is ``None``.  Any other value returns both sets.

    Returns:
        ``(training_images, test_images)``.  ``(None, None)`` when *dt* is
        not a recognised mode.

    Note:
        The ``'div'`` and ``'pick'`` modes read the module-level flag
        ``Ldebug`` and print each assignment when it is true.
    """
    if dt == 'npart':
        # Accept either a bare count or a sequence that starts with the count.
        nset = dl if isinstance(dl, int) else dl[0]
        images_sets = Images(total_images, dt, nset)
        training_images = images_sets.get_training_images()
        test_images = images_sets.get_test_images()
        return training_images, test_images

    if dt == 'int':
        training_images = total_images[dl[0]:dl[1]]
        if len(dl) >= 3:
            # The last two bounds always delimit the test region.
            test_start, test_stop = dl[-2:]
            test_images = total_images[test_start:test_stop]
        else:
            test_images = []
            print("There is no test set region ")
        if job == 'te':
            # Test-only run: the single interval given is the test region.
            return None, training_images
        # Every other job gets both sets (same convention as 'div'), so the
        # caller can always unpack two values.
        return training_images, test_images

    if dt == 'div':
        divider = dl[0]
        tr_remainder = dl[1]
        has_test_remainder = len(dl) == 3
        training_images = []
        test_images = []
        for i, image in enumerate(total_images):
            remainder = i % divider
            if remainder == tr_remainder:
                training_images.append(image)
                if Ldebug:
                    print(f"{i}-th image in training_images")
            if has_test_remainder and remainder == dl[2]:
                test_images.append(image)
                if Ldebug:
                    print(f"{i}-th image in test_images")
        if job == 'te':
            # Test-only run: the "training" selection is what gets tested.
            return None, training_images
        return training_images, test_images

    if dt == 'pick':
        training_images = []
        test_images = []
        if len(dl) == 2:
            n_train, n_test = dl
            cursor = 0  # position inside the current train/test cycle
            for j, image in enumerate(total_images):
                if cursor < n_train:
                    training_images.append(image)
                    if Ldebug:
                        print(f"{j}-th image in training_images")
                elif cursor < n_train + n_test:
                    test_images.append(image)
                    if Ldebug:
                        print(f"{j}-th image in test_images")
                else:
                    # First image of the next cycle: it belongs to training,
                    # and the cycle position restarts (0 here, 1 after the
                    # increment below).
                    training_images.append(image)
                    if Ldebug:
                        print(f"{j}-th image in training_images")
                    cursor = 0
                cursor += 1
        return training_images, test_images

    return None, None
def amp_jobs(fdata, job, nsets, HL, E_conv, Lgraph, ival_set):
    """Run one AMP job (profile, train+test, test, validate or MD) on *fdata*.

    The job is selected by substring search on *job*; the first match in the
    order ``"pr"``, ``"tr"``, ``"te"``, ``"va"``, ``"md"`` wins:

    * ``"pr"`` - plot the potential energy of every image.
    * ``"tr"`` - train on the training images, then immediately test on the
      test images using the potential file ``amp.amp``.
    * ``"te"`` - test only.
    * ``"va"`` - train/validate using validation set number *ival_set* and
      append ``"<ival_set>: <rmse>"`` to ``<stem><HL digits><E_conv>.val``.
    * ``"md"`` - run MD starting from frame *nsets* (frame 0 when *nsets*
      is falsy).

    Args:
        fdata: Path of the data file read with ``ase.io.read``.
        job: Job selector string (see above).
        nsets: Number of sets handed to ``Images``; for ``"md"`` it is the
            index of the start geometry instead.
        HL: Hidden-layer specification; forwarded to the training/title
            helpers and concatenated into the ``.val`` file name.
        E_conv: Energy convergence setting; forwarded likewise.
        Lgraph: Forwarded to ``exe_test_images``.
        ival_set: Index of the validation set; required for ``"va"`` and
            must be lower than ``nsets - 1``.

    Returns:
        None.

    Raises:
        SystemExit: In the ``"va"`` job, with status 2 when *ival_set* is
            missing and status 3 when it is out of range.
    """
    total_images = ase.io.read(fdata, index=':')
    images_sets = Images(total_images, nsets)
    amp_pes = "amp.amp"

    def run_test():
        """Evaluate the stored potential on the test images."""
        images = images_sets.get_test_images()
        title, suptitle = get_title(job, fdata, HL, E_conv,
                                    len(total_images), len(images))
        print("data test:total sets %d/%d" % (len(images), len(total_images)))
        exe_test_images(job, images, amp_pes, title, suptitle, Lgraph)

    if re.search("pr", job):
        energies = [mol.get_potential_energy() for mol in total_images]
        mplot_nvector([], energies, fdata.split(".")[0], 'sample', 'E(eV)')

    elif re.search("tr", job):
        images = images_sets.get_training_images()
        print("data training:total sets %d/%d" % (len(images), len(total_images)))
        exe_train_images(images, HL, E_conv)
        # Training is followed by a test run in the same invocation.
        run_test()

    elif re.search("te", job):
        run_test()

    elif re.search("va", job):
        print("validation test")
        print("data images are diveded into %d sets" % nsets)
        # The last set is reserved for testing, so valid indices are
        # 0 .. nsets-2.
        if ival_set is None:
            print(
                "index for validation set is reguired with '-i num' between 0 ~ {}"
                .format(nsets - 2))
            # A missing required option is an error: report a failing status
            # (2 is the conventional code for command-line usage errors).
            sys.exit(2)
        if ival_set >= nsets - 1:
            print("validation set index should be lower than {}".format(
                nsets - 1))
            print("refer to py_ai_ini.py -j amp")
            sys.exit(3)

        hl = ''.join(str(x) for x in HL)
        fname = fdata.split(".")[0] + hl + str(E_conv) + ".val"

        # Train on everything except the chosen validation set ...
        images, img_valid = images_sets.get_val_train_images(ival_set)
        print("num images: training {} validation {}".format(
            len(images), len(img_valid)))
        exe_train_images(images, HL, E_conv)

        # ... then score the fresh potential on the validation set.
        title, suptitle = get_title(job, fdata, HL, E_conv,
                                    len(total_images), len(images))
        rmserr = exe_test_images(job, img_valid, amp_pes, title, suptitle,
                                 Lgraph, val_id=ival_set)
        with open(fname, "a") as f:
            f.write("{}: {:5.3f}\n".format(ival_set, rmserr))

    elif re.search('md', job):
        # nsets doubles as the index of the start geometry for MD.
        if not nsets:
            atoms = ase.io.read(fdata, index='0')
        else:
            atoms = ase.io.read(fdata, index=nsets)
        run_md(atoms)
    return
def amp_jobs(fdata, job, data_int, amp_pes, HL, E_conv, f_conv, Lgraph, ncore,
             n_mol, Ltwinx):
    """Run one AMP job (profile, train, test or MD) on the images in *fdata*.

    The job is selected by substring search on *job*; the first match in the
    order ``"pr"``, ``"tr"``, ``"te"``, ``"md"`` wins.

    Args:
        fdata: Path of the data file read with ``ase.io.read``.
        job: Job selector string.
        data_int: Either an int (passed to ``Images`` as ``nsets``; ``0``
            means "test on every image" in the test job) or a sequence of
            slice bounds from which training/test regions are taken.
        amp_pes: Potential file forwarded to ``calc_test_images``.
        HL, E_conv, f_conv: Forwarded to the training, title and
            result-writing helpers.
        Lgraph, ncore, Ltwinx: Forwarded to the calculation helpers.
        n_mol: Forwarded as ``nmol`` to ``calc_test_images`` in the test job.

    Returns:
        None.
    """
    total_images = ase.io.read(fdata, index=':')
    images_sets = Images(total_images, nsets=data_int)
    by_sets = isinstance(data_int, int)

    # --- profile: plot the energy of every image ---------------------------
    if re.search("pr", job):
        energies = [mol.get_potential_energy() for mol in total_images]
        if fdata.endswith('extxyz'):
            mplot_nvector([], energies, fdata.split(".")[0], 'sample', 'E(eV)')
        elif fdata == "OUTCAR":
            mplot_nvector([], energies, Xtitle='sample', Ytitle='E(eV)')
        return

    # --- training ----------------------------------------------------------
    if re.search("tr", job):
        if by_sets:
            images = images_sets.get_training_images()
        else:
            images = images_sets.get_training_images(d_list=data_int[:2])
        print("data training:total sets %d/%d" % (len(images), len(total_images)))
        calc_train_images(images, HL, E_conv, f_conv, ncore)

        # The follow-up test only runs on the host named 'chi'
        # (original note: do not turn this on in qsub).
        if socket.gethostname() != 'chi':
            return

        if by_sets:
            images = images_sets.get_test_images()
        elif len(data_int) >= 3:
            region = data_int[1:] if len(data_int) == 3 else data_int[2:]
            images = images_sets.get_test_images(d_list=region)
        else:
            # NOTE(review): `images` still holds the training images here, so
            # the test below is evaluated on them - confirm this is intended.
            print("There is no test set region in -di ")

        title, suptitle = get_title(fdata, HL, E_conv, f_conv,
                                    len(total_images), len(images))
        print("data test:total sets %d/%d" % (len(images), len(total_images)))
        rmserr, max_res = calc_test_images(job, images, amp_pes, title,
                                           suptitle, Lgraph, ncore,
                                           Ltwinx=Ltwinx)
        f_write(fdata, HL, E_conv, f_conv, rmserr, max_res, job)
        return

    # --- test only ---------------------------------------------------------
    if re.search("te", job):
        if not by_sets:
            # Interval specification: the whole list is the test region.
            images = images_sets.get_test_images(d_list=data_int)
        elif data_int == 0:
            images = total_images
        else:
            images = images_sets.get_test_images()

        title, suptitle = get_title(fdata, HL, E_conv, f_conv,
                                    len(total_images), len(images))
        print("data test:total sets %d/%d" % (len(images), len(total_images)))
        rmserr, max_res = calc_test_images(job, images, amp_pes, title,
                                           suptitle, Lgraph, ncore,
                                           nmol=n_mol, Ltwinx=Ltwinx)
        f_write(fdata, HL, E_conv, f_conv, rmserr, max_res, job)
        return

    # --- molecular dynamics from the first geometry ------------------------
    if re.search('md', job):
        run_md(ase.io.read(fdata, index='0'))
    return
def amp_jobs(fdata, job, ndata, HL, E_conv):
    """Run one AMP job (train, test, validate or MD) on the images in *fdata*.

    The job is selected by substring search on *job*; the first match in the
    order ``"tr"``, ``"te"``, ``"va"``, ``"md"`` wins:

    * ``"tr"`` - train on all images when *ndata* is falsy, otherwise on
      ``Images.get_training_images(ndata)``.
    * ``"te"`` - test ``Images.get_test_images(ndata)`` against ``amp.amp``.
    * ``"va"`` - for each of the first ``ndata - 1`` sets (default 5 sets)
      train on the remaining training data, evaluate the resulting
      potential on the held-out set and print its RMS energy error.
    * ``"md"`` - run MD starting from frame *ndata* (frame 0 when falsy).

    Args:
        fdata: Path of the data file read with ``ase.io.read``.
        job: Job selector string (see above).
        ndata: Number of sets / amount of data passed to ``Images`` methods;
            for ``"md"`` it is the index of the start geometry instead.
        HL: Hidden-layer specification forwarded to the helpers.
        E_conv: Energy convergence setting forwarded to the helpers.

    Returns:
        None.
    """
    total_images = ase.io.read(fdata, index=':')
    images_c = Images(total_images)
    amp_pes = "amp.amp"

    if re.search("tr", job):
        if not ndata:
            images = images_c.total_images
            print("Start training using all the data %d" % len(images))
        else:
            images = images_c.get_training_images(ndata)
            print("data training:total sets %d/%d" % (len(images), len(total_images)))
        exe_train_images(images, HL, E_conv)

    elif re.search("te", job):
        images = images_c.get_test_images(ndata)
        title, suptitle = get_title(job, fdata, HL, E_conv,
                                    len(total_images), len(images))
        print("data test:total sets %d/%d" % (len(images), len(total_images)))
        exe_test_images(job, images, amp_pes, title, suptitle)

    elif re.search("va", job):
        print("validation test")
        if not ndata:
            ndata = 5
        print("data images are diveded into %d sets" % ndata)
        # The last set is kept for the final test, so only ndata-1 folds
        # are scanned.
        for i in range(ndata - 1):
            images, img_valid = images_c.get_val_train_images(ndata, i)
            print("num images: training {} validation {}".format(
                len(images), len(img_valid)))
            exe_train_images(images, HL, E_conv)

            calc = Amp.load(amp_pes)
            y = []      # reference energies stored with the images
            y_bar = []  # energies predicted by the freshly trained potential
            for mol in img_valid:
                y.append(mol.get_potential_energy())
                # Predict on a copy: attaching the Amp calculator to the
                # original image would replace its reference energy, and the
                # same image objects are reused as training data in the
                # following folds.
                probe = mol.copy()
                probe.set_calculator(calc)
                y_bar.append(probe.get_potential_energy())

            if y:
                squared = sum((ref - est) ** 2 for ref, est in zip(y, y_bar))
                err = (squared / len(y)) ** 0.5
                print("in job {}-{}: validation error is {}".format(job, i, err))

    elif re.search('md', job):
        print("ndata is used for start geometry")
        if not ndata:
            atoms = ase.io.read(fdata, index='0')
        else:
            atoms = ase.io.read(fdata, index=ndata)
        run_md(atoms)
    return