Example #1
    def read_ref_coord(self, filename="ref_coord"):
        """ reference coord. & frozen dimension """
        fp = open(filename, "r")
        # number of sites
        line = fp.readline().strip()
        n_site = int(line)
        # title
        line = fp.readline()
        coord = []
        name = []
        frozen = []
        # read coord
        for i in range(n_site):
            line = fp.readline()
            record = line.split()
            n_col = len(record[4:])
            c = [float(f) for f in record[1:4]]
            if n_col == 1:
                ndx = [int(record[4]) for _ in range(3)]  # one flag applies to all three dims
            elif n_col == 3:
                ndx = [int(d) for d in record[4:7]]
            else:
                print("Error: read failed, ref_coord, check ref_coord !!!")
                exit(1)
            name.append(record[0])
            coord.append(c)
            frozen.append(ndx)
        fp.close()
        self.vars['ref_info'] = \
            {'name': name, 'coord': coord, 'frozen': frozen, 'n_site': n_site}

        tools.dump_data(self.files['ref_internal'], self.vars['ref_info'])

        return
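The parser above fixes the ref_coord layout: a site count, a title line, then one record per site holding a name, three coordinates, and either one shared frozen flag or three per-dimension flags. A minimal sketch of a file that satisfies it (the two-site geometry and the reader call are hypothetical):

# hypothetical two-site ref_coord file matching the parser above; the single
# trailing flag on the O line is replicated to all three dimensions, while
# the H line spells out all three flags
sample = """2
water fragment (title line)
O   0.000000   0.000000   0.117790   1
H   0.756950   0.585880  -0.471160   0 0 1
"""
with open("ref_coord", "w") as fp:
    fp.write(sample)
# reader.read_ref_coord("ref_coord") would then store
# {'name': ['O', 'H'], 'coord': [[0.0, 0.0, 0.11779], ...],
#  'frozen': [[1, 1, 1], [0, 0, 1]], 'n_site': 2}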
Example #2
 def dump(self):
     """
     read gjf file; dump template in json format.
     """
     # self.wrt_gau_input(flag="cmp", jobfile="template.gjf")
     tools.dump_data(self.files['template'], self.template_cmp)

     return
Example #3
 def dump(self):
     """
     read in the gms template file; dump it in json format.
     """
     text = self.__remove_comment()
     mydict = self.__namelist_content(text)
     self.__namelist_split(mydict)
     tools.dump_data(self.files['template'], self.template)
     return
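Examples #2 and #3 (and the molpro and gjf variants below) all end by handing a parsed template to tools.dump_data(filename, obj). The helper itself is never shown in this listing; judging only from how it is called here, a minimal sketch is a thin JSON wrapper, with load_data as its inverse (the json backend is an assumption):

import json

def dump_data(filename, obj):
    # serialize any JSON-compatible object to a file
    with open(filename, "w") as fp:
        json.dump(obj, fp, indent=2)

def load_data(filename):
    # inverse of dump_data, as used by build_input and get_dim_info below
    with open(filename, "r") as fp:
        return json.load(fp)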
Example #4
    def dump(self):
        """
        read input file; dump template in json format.
        """
        self.rd_molpro_input()

        tools.dump_data(self.files['template'], self.template_cmp)

        return
Example #5
 def dump(self):
     """
     read gjf file; dump template in json format.
     """
     self.rd_gau_input()
     # self.wrt_gau_input(flag="cmp", jobfile="template.gjf")
     tools.dump_data(self.files['template'], self.template_cmp)

     return
Example #6
    def predict_pro(self, x):
        tmp_dir = tempfile.mkdtemp()
        issparse = sps.issparse(x)
        f_format = "svm" if issparse else "csv"

        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, "x_to_pred.{}".format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))

        with open(output_model, mode="w") as file:
            file.write(self.model)

        tools.dump_data(x, np.zeros(x.shape[0]), predict_filepath, issparse)

        calls = [
            "task = predict\n", "data = {}\n".format(predict_filepath),
            "input_model = {}\n".format(output_model),
            "output_result={}\n".format(output_results)
        ]

        with open(conf_filepath, 'w') as f:
            f.writelines(calls)

        process = subprocess.Popen(
            [self.exec_path, "config={}".format(conf_filepath)],
            stdout=subprocess.PIPE,
            bufsize=1)

        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        # wait for the subprocess to exit
        process.wait()

        raw_probabilities = np.loadtxt(output_results, dtype=float)

        if self.param['application'] == 'multiclass':
            y_prob = raw_probabilities

        elif self.param['application'] == 'binary':
            probability_of_one = raw_probabilities
            probability_of_zero = 1 - probability_of_one
            y_prob = np.transpose(
                np.vstack((probability_of_zero, probability_of_one)))
        else:
            raise ValueError(
                "unsupported application: {}".format(self.param['application']))

        shutil.rmtree(tmp_dir)
        return y_prob
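In the binary branch above, a flat vector of P(y=1) scores is widened into an (n_samples, 2) array. A standalone check of that reshaping:

import numpy as np

probability_of_one = np.array([0.9, 0.2, 0.5])   # raw per-sample scores
probability_of_zero = 1 - probability_of_one
y_prob = np.transpose(np.vstack((probability_of_zero, probability_of_one)))
print(y_prob.shape)   # (3, 2); each row is [P(y=0), P(y=1)] and sums to 1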
Example #7
 def dump_layer(self):
     """
     write down oniom input for each layer
     require additional info.
     """
     oniom = self.oniom
     for name in oniom.keys():
         layer = oniom[name]
         fname = layer['name'] + "-check.gjf"
         job = layer['job']
         fp = open(fname, "w")
         for line in job:
             print >> fp, "%s" % line,
         fp.close()
     tools.dump_data('layer.json', self.oniom)
     return
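dump_layer implies that self.oniom maps layer names to records holding at least a 'name' and a 'job' line list. A hypothetical two-layer value consistent with that access pattern (the route lines are invented for illustration):

# hypothetical oniom structure consistent with dump_layer above
oniom = {
    "high": {"name": "high", "job": ["%chk=high.chk\n", "#p b3lyp/6-31g*\n"]},
    "low":  {"name": "low",  "job": ["%chk=low.chk\n",  "#p pm6\n"]},
}
# the loop writes high-check.gjf and low-check.gjf, then
# tools.dump_data('layer.json', oniom) serializes the whole mapping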
Example #8
    def predict(self, x):
        tmp_dir = tempfile.mkdtemp()
        issparse = sps.issparse(x)
        f_format = 'svm' if issparse else 'csv'

        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, 'x_to_pred.{}'.format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")

        # write the stored model text out to a file
        with open(output_model, mode='w') as file:
            file.write(self.model)

        # write the input data to a file
        tools.dump_data(x, np.zeros(x.shape[0]), predict_filepath, issparse)

        calls = [
            'task = predict\n', 'data = {}\n'.format(predict_filepath),
            'input_model = {}\n'.format(output_model),
            'output_result = {}\n'.format(output_results)
        ]

        # write the configuration lines (calls) to a file
        with open(conf_filepath, 'w') as f:
            f.writelines(calls)

        # run the prediction subprocess (original note: "errors out, cannot predict")
        process = subprocess.Popen(
            [self.exec_path, 'config={}'.format(conf_filepath)],
            stdout=subprocess.PIPE,
            bufsize=1)

        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        process.wait()
        # process.communicate()

        y_pred = np.loadtxt(output_results, dtype=float)
        shutil.rmtree(tmp_dir)

        return y_pred
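Note that in the LightGBM wrappers (examples #6, #8, and #13) tools.dump_data has a different signature from the JSON-template examples: it writes a feature matrix plus labels in svm or csv form. A sketch consistent with that call shape, assuming scikit-learn's dump_svmlight_file for the sparse case and a label-first csv layout for the dense one:

import numpy as np
from sklearn.datasets import dump_svmlight_file

def dump_data(x, y, filepath, issparse):
    if issparse:
        # LightGBM reads the svmlight/libsvm text format directly
        dump_svmlight_file(x, y, filepath)
    else:
        # dense csv with the label as the first column
        np.savetxt(filepath, np.column_stack((y, x)), delimiter=",")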
Example #9
    def build_input(self):
        """
        base on the input parameter to build a series of files
        """
        # current interface file data.
        model = tools.load_data(self.files['interface'])
        # charge is kept fixed; spin is varied.
        n_state = model['parm']['n_state']
        i_state = model['parm']['i_state']

        filelist = []
        for i in range(n_state):
            filename = self.wrt_gjf(i)
            filelist.append(filename)
        filelist.reverse()

        tools.dump_data("filelist.dat", filelist)

        return filelist
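build_input only touches model['parm']['n_state'] and model['parm']['i_state'] from the interface file. A hypothetical minimal payload satisfying those lookups:

# hypothetical minimal content of self.files['interface'] (JSON on disk)
model = {"parm": {"n_state": 3, "i_state": 1}}
# n_state fixes how many per-state gjf inputs wrt_gjf generates; i_state is
# read here but consumed elsewhere (e.g. get_dim_info in Example #12)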
Example #10
    def get_dim_info(self):
        """
        obtain dimension data, such as the number of atoms et al.
        core orbitals are frozen in the Gaussian TDDFT implementation
        """
        logfile = self.files['log']
        if not os.path.isfile(logfile):
            print "DFT & TD calculation results do not exist!"
            print "Check the DFT calculation!", logfile
            exit(1)
        
        file_in = open(logfile, "r")
        for line in file_in:
            # match
            # NAtoms=    6 NActive=    6 NUniq=    6 SFac= 7.50D-01 NAtFMM=   80 NAOKFM=F Big=F
            pat0 = re.compile("NAtoms=(.*)NActive=(.*)NUniq=(.*)SFac=(.*)NAtFMM=(.*)")
            # NBasis=    38 NAE=     8 NBE=     8 NFC=     2 NFV=     0
            pat1 = re.compile("NBasis=(.*)NAE=(.*)NBE=(.*)NFC=(.*)NFV=(.*)")        
            # NROrb=     36 NOA=     6 NOB=     6 NVA=    30 NVB=    30
            pat2 = re.compile("NROrb=(.*)NOA=(.*)NOB=(.*)NVA=(.*)NVB=(.*)") 
            # nstates=3
            pat3 = re.compile("nstates=(\d)+", re.IGNORECASE)
            # ..
            m0 = pat0.search(line)           
            m1 = pat1.search(line)
            m2 = pat2.search(line)
            m3 = pat3.search(line)
            
            if m0 is not None:
                string = m0.group()
                record = string.split()
                self.dim['n_atom'] = int(record[1])
                self.dim['n_active'] = int(record[3])
                
            elif m1 is not None:
                string = m1.group()
                record = string.split()
                self.dim['n_basis'] = int(record[1])
                self.dim['neleA'] = int(record[3])
                self.dim['neleB'] = int(record[5])
                self.dim['nfixcore'] = int(record[7])
                self.dim['nfixvir'] = int(record[9])                
                # guess occ_all
                self.dim['nocc_allA'] = self.dim['neleA']
                self.dim['nvir_allA'] = self.dim['n_basis'] - self.dim['nocc_allA']
                self.dim['nocc_allB'] = self.dim['neleB']
                self.dim['nvir_allB'] = self.dim['n_basis'] - self.dim['nocc_allB']
                #print self.dim['neleA']

            elif m2 is not None:
                string = m2.group()
                record = string.split()
                #"$NoccA $NoccB $NvirtA $NvirtB";
                self.dim['norb'] = int(record[1]) # number of orbital active
                self.dim['noccA'] = int(record[3])
                self.dim['noccB'] = int(record[5])
                self.dim['nvirA'] = int(record[7])
                self.dim['nvirB'] = int(record[9])
            elif m3 is not None:
                string = m3.group()
                record = string.split("=")
                self.dim['n_state'] = int(record[1]) + 1    # add 1, because of the ground state
            else:
                continue
        
        file_in.close()
                
        tools.dump_data('dimension.json', self.dim)                
        
        return
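The odd-index arithmetic (record[1], record[3], ...) in examples #10 and #11 works because each matched string, once split on whitespace, alternates KEY= tokens with their values. A quick standalone check against the sample line quoted in the comments:

import re

pat1 = re.compile(r"NBasis=(.*)NAE=(.*)NBE=(.*)NFC=(.*)NFV=(.*)")
line = "NBasis=    38 NAE=     8 NBE=     8 NFC=     2 NFV=     0"
record = pat1.search(line).group().split()
# record == ['NBasis=', '38', 'NAE=', '8', 'NBE=', '8', 'NFC=', '2', 'NFV=', '0']
print(int(record[1]), int(record[3]), int(record[7]))   # 38 8 2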
Example #11
    def __get_dim_info(self):
        """
        obtain dimension data, such as the number of atoms et al.
        core orbitals are frozen in the Gaussian TDDFT implementation
        """
        logfile = self.files['mo']
        file_in = open(logfile, "r")
        for line in file_in:
            # match
            # NAtoms=    6 NActive=    6 NUniq=    6 SFac= 7.50D-01 NAtFMM=   80 NAOKFM=F Big=F
            pat0 = re.compile("NAtoms=(.*)NActive=(.*)NUniq=(.*)SFac=(.*)NAtFMM=(.*)")
            # NBasis=    38 NAE=     8 NBE=     8 NFC=     2 NFV=     0
            pat1 = re.compile("NBasis=(.*)NAE=(.*)NBE=(.*)NFC=(.*)NFV=(.*)")        
            # NROrb=     36 NOA=     6 NOB=     6 NVA=    30 NVB=    30
            pat2 = re.compile("NROrb=(.*)NOA=(.*)NOB=(.*)NVA=(.*)NVB=(.*)") 
            # nstates=3
            pat3 = re.compile("nstates=(\d)+", re.IGNORECASE)
            pat4 = re.compile("root=(\d)+", re.IGNORECASE)
            # ..
            m0 = pat0.search(line)           
            m1 = pat1.search(line)
            m2 = pat2.search(line)
            m3 = pat3.search(line)
            m4 = pat4.search(line)
            
            if m0 is not None:
                string = m0.group()
                record = string.split()
                self.dim['n_atom'] = int(record[1])
                self.dim['n_active'] = int(record[3])
                
            elif m1 is not None:
                string = m1.group()
                record = string.split()
                self.dim['n_basis'] = int(record[1])
                self.dim['neleA'] = int(record[3])
                self.dim['neleB'] = int(record[5])
                self.dim['nfixcore'] = int(record[7])
                self.dim['nfixvir'] = int(record[9])                
                # guess occ_all
                self.dim['nocc_allA'] = self.dim['neleA']
                self.dim['nvir_allA'] = self.dim['n_basis'] - self.dim['nocc_allA']
                self.dim['nocc_allB'] = self.dim['neleB']
                self.dim['nvir_allB'] = self.dim['n_basis'] - self.dim['nocc_allB']
                #print self.dim['neleA']

            elif m2 is not None:
                string = m2.group()
                record = string.split()
                #"$NoccA $NoccB $NvirtA $NvirtB";
                self.dim['norb'] = int(record[1]) # number of orbital active
                self.dim['noccA'] = int(record[3])
                self.dim['noccB'] = int(record[5])
                self.dim['nvirA'] = int(record[7])
                self.dim['nvirB'] = int(record[9])
            elif m3 is not None:
                string = m3.group()
                record = string.split("=")
                self.dim['n_state'] = int(record[1]) + 1    # add 1, because of the ground state
            elif m4 is not None:
                string = m4.group()
                record = string.split("=")
                self.dim['i_state'] = int(record[1]) + 1    # shift by 1, counting the ground state
            else:
                continue
        
        file_in.close()
        print "DIMENSIONAL INFO DONE"
        tools.dump_data('dimension.json', self.dim)                

        return
Example #12
    def get_dim_info(self):
        """
        obtain dimension data, such as the number of atoms et al.
        parse the gamess-us log file.
        """
        # default setting
        myobj = tools.load_data(self.files['interface'])
        self.dim['n_state'] = myobj['parm']['n_state']
        self.dim['i_state'] = myobj['parm']['i_state']
        # read 
        logfile = self.files['log']
        fp = open(logfile, "r")

        line = "STARTER"
        pat = re.compile("TOTAL NUMBER OF BASIS SET SHELLS")
        
        while line != "":
            line = fp.readline()
            m = pat.search(line)
            if m is not None:
                break
        # shell num.
        t_line = line
        # print t_line
        val = t_line.split("=")[1]
        n_shell = int(val)     
        
        # READ THE FOLLOWING LINES
        # 9 lines
        # TOTAL NUMBER OF BASIS SET SHELLS             =   10
        # NUMBER OF CARTESIAN GAUSSIAN BASIS FUNCTIONS =   38
        # NUMBER OF ELECTRONS                          =   14
        # CHARGE OF MOLECULE                           =    0
        # SPIN MULTIPLICITY                            =    1
        # NUMBER OF OCCUPIED ORBITALS (ALPHA)          =    7
        # NUMBER OF OCCUPIED ORBITALS (BETA )          =    7
        # TOTAL NUMBER OF ATOMS                        =    2
        # THE NUCLEAR REPULSION ENERGY IS       22.5117346394
        #       
        # number of cart gaussian basis functions
        t_line = fp.readline()
        val = t_line.split("=")[1]
        self.dim['n_basis'] = int(val)
        #print t_line
        # number of electrons
        t_line = fp.readline()
        val = t_line.split("=")[1]
        self.dim['n_elec'] = int(val)

        # mol. charge
        t_line = fp.readline()
        val = t_line.split("=")[1]
        charge = int(val)

        # spin-mult
        t_line = fp.readline()
        val = t_line.split("=")[1]
        spin = int(val)

        # number-occupied-orbitals-alpha
        t_line = fp.readline()
        val = t_line.split("=")[1]
        self.dim['neleA'] = int(val)        

        # number-occupied-orbitals-beta
        t_line = fp.readline()
        val = t_line.split("=")[1]
        self.dim['neleB'] = int(val)                
        #print line
        # number-of-atoms
        t_line = fp.readline()
        val = t_line.split("=")[1]
        self.dim['n_atom'] = int(val)

        # other
        self.dim['noccA'] = self.dim['neleA']
        self.dim['nvirA'] = self.dim['n_basis'] - self.dim['neleA']
        self.dim['nvir_allA'] = self.dim['nvirA']
        self.dim['nocc_allA'] = self.dim['noccA']
                 
        # TDDFT INPUT PARAMETERS
        pat = re.compile("TDDFT INPUT PARAMETERS")
        line = "starter"
        while line != "":
            line = fp.readline()
            m = pat.search(line)
            if m is not None:
                break
        line = fp.readline()
        # reading...
        #   NSTATE=       3  IROOT=       1   MULT=       1
        t_line = fp.readline()
        pat0 = re.compile("NSTATE=(.*)IROOT=(.*)MULT=(.*)")
        m = pat0.search(t_line)
        if m is not None:
            self.dim['n_state'] = int(m.group(1)) + 1 # because of the ground state.
            self.dim['i_state'] = int(m.group(2))
        else:
            print "<^WARNING> CANNOT FIND TD-DFT INPUT PARAMETERS SETTING: [suppose it to be ground state]"
         
        fp.close()
                
        tools.dump_data('dimension.json', self.dim)                

        return
Example #13
    def fit(self, x, y, test_data=None, init_scores=[]):
        tmp_dir = tempfile.mkdtemp()  # create a temporary working directory
        issparse = sps.issparse(x)
        f_format = 'svm' if issparse else 'csv'

        train_filepath = os.path.abspath('{}/x.{}'.format(
            tmp_dir, f_format))  # where the training data is written
        init_filepath = train_filepath + '.init'
        tools.dump_data(x, y, train_filepath, issparse)

        if len(init_scores) > 0:
            assert len(init_scores) == x.shape[0]  # one init score per sample
            np.savetxt(init_filepath,
                       init_scores,
                       delimiter=',',
                       newline=os.linesep)

        if test_data:
            valid = []
            for i, (x_test, y_test
                    ) in enumerate(test_data):  # test_data = [(x_test, y_test), ...]
                test_filepath = os.path.abspath('{}/x{}_test.{}'.format(
                    tmp_dir, i, f_format))  # where the validation data is written
                valid.append(test_filepath)
                tools.dump_data(x_test, y_test, test_filepath, issparse)
            self.param['valid'] = ','.join(valid)  # validation file paths

        self.param['task'] = 'train'
        self.param['data'] = train_filepath
        self.param['output_model'] = os.path.join(tmp_dir,
                                                  'lightgbm_model.txt')
        calls = ['{}={}\n'.format(k, self.param[k])
                 for k in self.param]  # training parameters as key=value lines

        if self.config == '':  # no user config given: use the parameters built above
            conf_filepath = os.path.join(tmp_dir, 'train_conf')  # training config path
            with open(conf_filepath, 'w') as f:
                f.writelines(calls)

            process = subprocess.Popen(
                [self.exec_path, 'config={}'.format(conf_filepath)],
                stdout=subprocess.PIPE,
                bufsize=1)  # run the executable, piping its stdout
        else:
            process = subprocess.Popen(
                [self.exec_path, 'config={}'.format(self.config)],
                stdout=subprocess.PIPE,
                bufsize=1)

        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        process.wait()

        with open(self.param['output_model'],
                  mode='r') as f:  # read the trained model back into self.model
            self.model = f.read()
        shutil.rmtree(tmp_dir)  # clean up the temporary directory

        if test_data and self.param['early_stopping_round'] > 0:
            self.best_round = max(
                map(int, re.findall(r'Tree=(\d+)', self.model))) + 1
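The early-stopping bookkeeping at the end of fit recovers the best iteration by scanning the saved model text for Tree= headers. A standalone check of that regex on a fabricated three-tree fragment:

import re

model_text = "Tree=0\n...\nTree=1\n...\nTree=2\n...\n"
best_round = max(map(int, re.findall(r"Tree=(\d+)", model_text))) + 1
print(best_round)   # 3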
Example #14
###################### INITIAL SETUP STEPS ######################

if len(sys.argv) != 5:
    print("python TraP_source_overview.py <database> <dataset_id> <release> <sigma>")
    exit()
database = sys.argv[1]
dataset_id = str(sys.argv[2])
release = str(sys.argv[3])
sigma = float(sys.argv[4])

if release != "0" and release != "1m" and release != "1p":
    print "This script is for either Cycle0 (0) or Release 1 MonetDB (1m) or Release 1 Postgres (1p) databases, please specify 0, 1m or 1p."
    exit()


###################### DEFINE SUBROUTINES ######################


tools.dump_data(release, database, dataset_id)
transients = []
data = open("ds_" + dataset_id + "_transients.csv", "r")
for line in data:
    line = line.rstrip()
    transients.append(line.split(","))
data.close()
trans_data = []
for row in transients:
    trans_data.append([row[4], float(row[3]), float(row[10])])
tools.detect_anomaly(trans_data, sigma)