def read_ref_coord(self, filename="ref_coord"):
    """
    Read reference coordinates and frozen-dimension flags from a file.

    Expected file layout (whitespace separated):
        line 1: number of sites
        line 2: title (ignored)
        then one record per site:
            name x y z f           (one flag applied to all 3 dims)
         or name x y z fx fy fz    (one flag per dimension)

    Stores the parsed data in self.vars['ref_info'] and dumps it to
    self.files['ref_internal'] via tools.dump_data.

    Parameters
    ----------
    filename : str
        Path of the reference-coordinate file (default "ref_coord").
    """
    # 'with' guarantees the handle is closed even on a parse error;
    # the original never closed the file.
    with open(filename, "r") as fp:
        n_site = int(fp.readline().strip())  # number of sites
        fp.readline()                        # title line, discarded

        name = []
        coord = []
        frozen = []
        for _ in range(n_site):
            record = fp.readline().split()
            c = [float(f) for f in record[1:4]]
            n_col = len(record[4:])
            if n_col == 1:
                # single flag: convert once and replicate over x, y, z
                ndx = [int(record[4])] * 3
            elif n_col == 3:
                ndx = [int(d) for d in record[4:7]]
            else:
                print("Error: read failed, ref_coord, check ref_coord !!!")
                exit(1)
            name.append(record[0])
            coord.append(c)
            frozen.append(ndx)

    self.vars['ref_info'] = \
        {'name': name, 'coord': coord, 'frozen': frozen, 'n_site': n_site}
    tools.dump_data(self.files['ref_internal'], self.vars['ref_info'])
    return
def dump(self):
    """Serialize the compound-job gjf template to its JSON file."""
    target = self.files['template']
    tools.dump_data(target, self.template_cmp)
    return
def dump(self):
    """Parse the gms template file and write it out in JSON form."""
    stripped = self.__remove_comment()
    namelists = self.__namelist_content(stripped)
    self.__namelist_split(namelists)
    tools.dump_data(self.files['template'], self.template)
    return
def dump(self):
    """Read in the gms template file and dump it as JSON."""
    content = self.__namelist_content(self.__remove_comment())
    self.__namelist_split(content)
    tools.dump_data(self.files["template"], self.template)
    return
def dump(self):
    """Parse the molpro input, then serialize the template as JSON."""
    self.rd_molpro_input()
    template_file = self.files['template']
    tools.dump_data(template_file, self.template_cmp)
    return
def dump(self):
    """Read the gjf input file; dump the template in JSON format."""
    self.rd_gau_input()
    destination = self.files['template']
    tools.dump_data(destination, self.template_cmp)
    return
def predict_pro(self, x):
    """
    Predict class probabilities for x with the trained LightGBM model.

    Writes the stored model and the feature matrix into a temporary
    directory, runs the LightGBM executable in predict mode, and parses
    the raw scores it emits.

    Parameters
    ----------
    x : array-like or scipy sparse matrix
        Feature matrix to score.

    Returns
    -------
    numpy.ndarray
        For 'multiclass': the raw probability array as loaded.
        For 'binary': an (n_samples, 2) array of [P(class 0), P(class 1)].

    Raises
    ------
    ValueError
        If self.param['application'] is neither 'multiclass' nor 'binary'.
        (The original used a bare ``raise`` with no active exception,
        which itself blows up with an unrelated RuntimeError.)
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        issparse = sps.issparse(x)
        f_format = "svm" if issparse else "csv"
        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, "x_to_pred.{}".format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))
        # write the in-memory model text to a file the executable can read
        with open(output_model, mode="w") as file:
            file.write(self.model)
        # labels are irrelevant for prediction; zeros act as placeholders
        tools.dump_data(x, np.zeros(x.shape[0]), predict_filepath, issparse)
        calls = [
            "task = predict\n",
            "data = {}\n".format(predict_filepath),
            "input_model = {}\n".format(output_model),
            "output_result={}\n".format(output_results)
        ]
        with open(conf_filepath, 'w') as f:
            f.writelines(calls)
        process = subprocess.Popen(
            [self.exec_path, "config={}".format(conf_filepath)],
            stdout=subprocess.PIPE, bufsize=1)
        # stream the executable's output; echo only in verbose mode
        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        # wait for the subprocess to exit
        process.wait()
        raw_probabilities = np.loadtxt(output_results, dtype=float)
        if self.param['application'] == 'multiclass':
            y_prob = raw_probabilities
        elif self.param['application'] == 'binary':
            probability_of_one = raw_probabilities
            probability_of_zero = 1 - probability_of_one
            y_prob = np.transpose(
                np.vstack((probability_of_zero, probability_of_one)))
        else:
            raise ValueError(
                "predict_pro supports only 'multiclass' or 'binary' "
                "applications, got {!r}".format(self.param['application']))
    finally:
        # always remove the scratch directory, even when prediction fails
        shutil.rmtree(tmp_dir)
    return y_prob
def dump_layer(self):
    """
    Write an oniom Gaussian input file ("<layer-name>-check.gjf") for
    each layer in self.oniom, then dump the whole oniom dict to
    'layer.json'.
    """
    oniom = self.oniom
    for name in oniom.keys():
        layer = oniom[name]
        # output file is named after the layer itself, not the dict key
        fname = layer['name'] + "-check.gjf"
        job = layer['job']
        fp = open(fname, "w")
        for line in job:
            # Python 2 print statement; the trailing comma suppresses the
            # automatic newline (the job lines presumably carry their own
            # newline — TODO confirm against the builder of layer['job'])
            print >> fp, "%s" % line,
        fp.close()
    tools.dump_data('layer.json', self.oniom)
    return
def predict(self, x):
    """
    Predict raw scores for x with the trained LightGBM model.

    Serializes the stored model and the feature matrix into a temporary
    directory, drives the LightGBM executable in predict mode, and loads
    the result file it writes.

    Parameters
    ----------
    x : array-like or scipy sparse matrix
        Feature matrix to score.

    Returns
    -------
    numpy.ndarray
        Predictions as loaded by np.loadtxt.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        issparse = sps.issparse(x)
        f_format = 'svm' if issparse else 'csv'
        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, 'x_to_pred.{}'.format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")
        # write the in-memory model text to a file
        with open(output_model, mode='w') as file:
            file.write(self.model)
        # dump the input data (labels are dummy zeros for prediction)
        tools.dump_data(x, np.zeros(x.shape[0]), predict_filepath, issparse)
        calls = [
            'task = predict\n',
            'data = {}\n'.format(predict_filepath),
            'input_model = {}\n'.format(output_model),
            'output_result = {}\n'.format(output_results)
        ]
        # write the configuration lines to the conf file
        with open(conf_filepath, 'w') as f:
            f.writelines(calls)
        process = subprocess.Popen(
            [self.exec_path, 'config={}'.format(conf_filepath)],
            stdout=subprocess.PIPE, bufsize=1)
        # stream subprocess output; echo only in verbose mode
        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        process.wait()
        y_pred = np.loadtxt(output_results, dtype=float)
    finally:
        # clean the scratch directory even when prediction fails
        # (the original leaked it on any exception)
        shutil.rmtree(tmp_dir)
    return y_pred
def build_input(self):
    """
    Build one gjf input file per electronic state and record the file
    names (in reversed order) in "filelist.dat".

    Returns
    -------
    list
        The reversed list of generated file names.
    """
    # current interface file data; charges are kept, spin varied per state
    model = tools.load_data(self.files['interface'])
    n_state = model['parm']['n_state']
    i_state = model['parm']['i_state']
    filelist = [self.wrt_gjf(i) for i in xrange(n_state)]
    filelist.reverse()
    tools.dump_data("filelist.dat", filelist)
    return filelist
def get_dim_info(self): """ obtain dimension data. such as number of atoms and et al. core orbitals are frozen in the Gaussian TDDFT implementation """ logfile = self.files['log'] if not os.path.isfile(logfile): print "DFT & TD calculation results do not exist!" print "Check the DFT calculation!", logfile exit(1) file_in = open(logfile, "r") for line in file_in: # match # NAtoms= 6 NActive= 6 NUniq= 6 SFac= 7.50D-01 NAtFMM= 80 NAOKFM=F Big=F pat0 = re.compile("NAtoms=(.*)NActive=(.*)NUniq=(.*)SFac=(.*)NAtFMM=(.*)") # NBasis= 38 NAE= 8 NBE= 8 NFC= 2 NFV= 0 pat1 = re.compile("NBasis=(.*)NAE=(.*)NBE=(.*)NFC=(.*)NFV=(.*)") # NROrb= 36 NOA= 6 NOB= 6 NVA= 30 NVB= 30 pat2 = re.compile("NROrb=(.*)NOA=(.*)NOB=(.*)NVA=(.*)NVB=(.*)") # nstates=3 pat3 = re.compile("nstates=(\d)+", re.IGNORECASE) # .. m0 = pat0.search(line) m1 = pat1.search(line) m2 = pat2.search(line) m3 = pat3.search(line) if m0 is not None: string = m0.group() record = string.split() self.dim['n_atom'] = int(record[1]) self.dim['n_active'] = int(record[3]) elif m1 is not None: string = m1.group() record = string.split() self.dim['n_basis'] = int(record[1]) self.dim['neleA'] = int(record[3]) self.dim['neleB'] = int(record[5]) self.dim['nfixcore'] = int(record[7]) self.dim['nfixvir'] = int(record[9]) # guess occ_all self.dim['nocc_allA'] = self.dim['neleA'] self.dim['nvir_allA'] = self.dim['n_basis'] - self.dim['nocc_allA'] self.dim['nocc_allB'] = self.dim['neleB'] self.dim['nvir_allB'] = self.dim['n_basis'] - self.dim['nocc_allB'] #print self.dim['neleA'] elif m2 is not None: string = m2.group() record = string.split() #"$NoccA $NoccB $NvirtA $NvirtB"; self.dim['norb'] = int(record[1]) # number of orbital active self.dim['noccA'] = int(record[3]) self.dim['noccB'] = int(record[5]) self.dim['nvirA'] = int(record[7]) self.dim['nvirB'] = int(record[9]) elif m3 is not None: string = m3.group() record = string.split("=") self.dim['n_state'] = int(record[1]) + 1 # add 1, because of the ground state else: continue 
file_in.close() tools.dump_data('dimension.json', self.dim) return
def __get_dim_info(self): """ obtain dimension data. such as number of atoms and et al. core orbitals are frozen in the Gaussian TDDFT implementation """ logfile = self.files['mo'] file_in = open(logfile, "r") for line in file_in: # match # NAtoms= 6 NActive= 6 NUniq= 6 SFac= 7.50D-01 NAtFMM= 80 NAOKFM=F Big=F pat0 = re.compile("NAtoms=(.*)NActive=(.*)NUniq=(.*)SFac=(.*)NAtFMM=(.*)") # NBasis= 38 NAE= 8 NBE= 8 NFC= 2 NFV= 0 pat1 = re.compile("NBasis=(.*)NAE=(.*)NBE=(.*)NFC=(.*)NFV=(.*)") # NROrb= 36 NOA= 6 NOB= 6 NVA= 30 NVB= 30 pat2 = re.compile("NROrb=(.*)NOA=(.*)NOB=(.*)NVA=(.*)NVB=(.*)") # nstates=3 pat3 = re.compile("nstates=(\d)+", re.IGNORECASE) pat4 = re.compile("root=(\d)+", re.IGNORECASE) # .. m0 = pat0.search(line) m1 = pat1.search(line) m2 = pat2.search(line) m3 = pat3.search(line) m4 = pat4.search(line) if m0 is not None: string = m0.group() record = string.split() self.dim['n_atom'] = int(record[1]) self.dim['n_active'] = int(record[3]) elif m1 is not None: string = m1.group() record = string.split() self.dim['n_basis'] = int(record[1]) self.dim['neleA'] = int(record[3]) self.dim['neleB'] = int(record[5]) self.dim['nfixcore'] = int(record[7]) self.dim['nfixvir'] = int(record[9]) # guess occ_all self.dim['nocc_allA'] = self.dim['neleA'] self.dim['nvir_allA'] = self.dim['n_basis'] - self.dim['nocc_allA'] self.dim['nocc_allB'] = self.dim['neleB'] self.dim['nvir_allB'] = self.dim['n_basis'] - self.dim['nocc_allB'] #print self.dim['neleA'] elif m2 is not None: string = m2.group() record = string.split() #"$NoccA $NoccB $NvirtA $NvirtB"; self.dim['norb'] = int(record[1]) # number of orbital active self.dim['noccA'] = int(record[3]) self.dim['noccB'] = int(record[5]) self.dim['nvirA'] = int(record[7]) self.dim['nvirB'] = int(record[9]) elif m3 is not None: string = m3.group() record = string.split("=") self.dim['n_state'] = int(record[1]) + 1 # add 1, because of the ground state elif m4 is not None: string = m4.group() record = string.split("=") 
self.dim['i_state'] = int(record[1]) + 1 else: continue file_in.close() print "DIMENSIONAL INFO DONE" tools.dump_data('dimension.json', self.dim) return
def get_dim_info(self):
    """
    Parse a GAMESS-US log file for dimension data (number of atoms,
    basis functions, electrons, occupied orbitals, TDDFT states) and
    store it in self.dim; dump the result to 'dimension.json'.

    NOTE: this relies on the fixed line order GAMESS uses after the
    "TOTAL NUMBER OF BASIS SET SHELLS" header — the readline() calls
    below must stay in exactly this sequence.
    """
    # default setting: state counts come from the interface file first
    myobj = tools.load_data(self.files['interface'])
    self.dim['n_state'] = myobj['parm']['n_state']
    self.dim['i_state'] = myobj['parm']['i_state']
    # read: scan forward to the basis-set header line
    logfile = self.files['log']
    fp = open(logfile, "r")
    line = "STARTER"
    pat = re.compile("TOTAL NUMBER OF BASIS SET SHELLS")
    while line != "":
        line = fp.readline()
        m = pat.search(line)
        if m is not None:
            break
    # shell num.
    t_line = line
    # print t_line
    val = t_line.split("=")[1]
    n_shell = int(val)
    # READ THE FOLLOWING LINES
    # 9 lines
    # TOTAL NUMBER OF BASIS SET SHELLS = 10
    # NUMBER OF CARTESIAN GAUSSIAN BASIS FUNCTIONS = 38
    # NUMBER OF ELECTRONS = 14
    # CHARGE OF MOLECULE = 0
    # SPIN MULTIPLICITY = 1
    # NUMBER OF OCCUPIED ORBITALS (ALPHA) = 7
    # NUMBER OF OCCUPIED ORBITALS (BETA ) = 7
    # TOTAL NUMBER OF ATOMS = 2
    # THE NUCLEAR REPULSION ENERGY IS 22.5117346394
    #
    # number of cart gaussian basis functions
    t_line = fp.readline()
    val = t_line.split("=")[1]
    self.dim['n_basis'] = int(val)
    #print t_line
    # number of electrons
    t_line = fp.readline()
    val = t_line.split("=")[1]
    self.dim['n_elec'] = int(val)
    # mol. charge (read but only kept locally)
    t_line = fp.readline()
    val = t_line.split("=")[1]
    charge = int(val)
    # spin-mult (read but only kept locally)
    t_line = fp.readline()
    val = t_line.split("=")[1]
    spin = int(val)
    # number-occupied-orbitals-alpha
    t_line = fp.readline()
    val = t_line.split("=")[1]
    self.dim['neleA'] = int(val)
    # number-occupied-orbitals-beta
    t_line = fp.readline()
    val = t_line.split("=")[1]
    self.dim['neleB'] = int(val)
    #print line
    # number-of-atoms
    t_line = fp.readline()
    val = t_line.split("=")[1]
    self.dim['n_atom'] = int(val)
    # other: derive virtual/occupied totals from the basis size
    self.dim['noccA'] = self.dim['neleA']
    self.dim['nvirA'] = self.dim['n_basis'] - self.dim['neleA']
    self.dim['nvir_allA'] = self.dim['nvirA']
    self.dim['nocc_allA'] = self.dim['noccA']
    # TDDFT INPUT PARAMETERS: scan forward to the TDDFT section header
    pat = re.compile("TDDFT INPUT PARAMETERS")
    line = "starter"
    while line != "":
        line = fp.readline()
        m = pat.search(line)
        if m is not None:
            break
    line = fp.readline()
    # reading...
    # NSTATE= 3 IROOT= 1 MULT= 1
    t_line = fp.readline()
    pat0 = re.compile("NSTATE=(.*)IROOT=(.*)MULT=(.*)")
    m = pat0.search(t_line)
    if m is not None:
        self.dim['n_state'] = int(m.group(1)) + 1  # because of the ground state.
        self.dim['i_state'] = int(m.group(2))
    else:
        print "<^WARNING> CANNOT FIND TD-DFT INPUT PARAMETERS SETTING: [suppose it to be ground state]"
    fp.close()
    tools.dump_data('dimension.json', self.dim)
    return
def fit(self, x, y, test_data=None, init_scores=None):
    """
    Train a LightGBM model by driving the command-line executable.

    Parameters
    ----------
    x : array-like or scipy sparse matrix
        Training features.
    y : array-like
        Training labels.
    test_data : list of (x_test, y_test) tuples, optional
        Validation sets; written out and passed via the 'valid' option.
    init_scores : sequence of float, optional
        Initial scores, one per training sample.  (The original used a
        mutable default argument ``[]``; ``None`` is the safe idiom and
        is backward compatible for all callers.)
    """
    tmp_dir = tempfile.mkdtemp()  # scratch directory for data/conf/model
    try:
        issparse = sps.issparse(x)
        f_format = 'svm' if issparse else 'csv'
        # where the training data is stored
        train_filepath = os.path.abspath('{}/x.{}'.format(tmp_dir, f_format))
        init_filepath = train_filepath + '.init'
        tools.dump_data(x, y, train_filepath, issparse)
        if init_scores:
            # must be exactly one initial score per training sample
            assert len(init_scores) == x.shape[0]
            np.savetxt(init_filepath, x=init_scores, delimiter=',',
                       newline=os.linesep)
        if test_data:
            valid = []
            for i, (x_test, y_test) in enumerate(test_data):
                # one file per validation set
                test_filepath = os.path.abspath('{}/x{}_test.{}'.format(
                    tmp_dir, i, f_format))
                valid.append(test_filepath)
                tools.dump_data(x_test, y_test, test_filepath, issparse)
            self.param['valid'] = ','.join(valid)  # validation paths
        self.param['task'] = 'train'
        self.param['data'] = train_filepath
        self.param['output_model'] = os.path.join(tmp_dir,
                                                  'lightgbm_model.txt')
        # flatten the parameter dict into "key=value" lines
        calls = ['{}={}\n'.format(k, self.param[k]) for k in self.param]
        if self.config == '':
            # no external config supplied: write our parameters to a file
            conf_filepath = os.path.join(tmp_dir, 'train_conf')
            with open(conf_filepath, 'w') as f:
                f.writelines(calls)
            process = subprocess.Popen(
                [self.exec_path, 'config = {}'.format(conf_filepath)],
                stdout=subprocess.PIPE, bufsize=1)
        else:
            # user-supplied config file takes precedence
            process = subprocess.Popen(
                [self.exec_path, 'config = {}'.format(self.config)],
                stdout=subprocess.PIPE, bufsize=1)
        # stream trainer output; echo only in verbose mode
        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        process.wait()
        # read the trained model text back into memory
        with open(self.param['output_model'], mode='r') as f:
            self.model = f.read()
        if test_data and self.param['early_stopping_round'] > 0:
            # best round = highest tree index found in the model dump + 1
            self.best_round = max(
                map(int, re.findall(r'Tree=(\d+)', self.model))) + 1
    finally:
        # always remove the scratch directory (the original leaked it
        # whenever training raised)
        shutil.rmtree(tmp_dir)
###################### INITIAL SETUP STEPS ######################
# Command line: <database> <dataset_id> <release> <sigma>
# Single-argument print() calls are valid in both Python 2 and 3,
# unlike the original Python-2-only print statements.
if len(sys.argv) != 5:
    print("python TraP_source_overview.py <database> <dataset_id> <release> <sigma>")
    exit()
database = sys.argv[1]
dataset_id = str(sys.argv[2])
release = str(sys.argv[3])
sigma = float(sys.argv[4])
# only Cycle0 (0), Release 1 MonetDB (1m) or Release 1 Postgres (1p) supported
if release not in ("0", "1m", "1p"):
    print("This script is for either Cycle0 (0) or Release 1 MonetDB (1m) or Release 1 Postgres (1p) databases, please specify 0, 1m or 1p.")
    exit()

###################### DEFINE SUBROUTINES ######################
# extract the transient data for this dataset into a csv file
tools.dump_data(release, database, dataset_id)

# parse the csv: one transient per line, comma separated;
# 'with' guarantees the file is closed
transients = []
with open("ds_" + dataset_id + "_transients.csv", "r") as data:
    for row in data:
        transients.append(row.rstrip().split(","))

# keep columns 4, 3 and 10 for anomaly detection
# NOTE(review): column meanings are inferred from indices only — confirm
# against the csv produced by tools.dump_data
trans_data = [[t[4], float(t[3]), float(t[10])] for t in transients]
tools.detect_anomaly(trans_data, sigma)