def make_dataset(self):
    total = len(self.vasp_dirs)
    for i in range(total):
        print_file("Dataset generation progress: %s / %s" % (i + 1, total))
        self.__make_one_dataset(self.vasp_dirs[i])
    self.b_make_dataset = 1
def __make_one_dataset(self, vasp_dir):
    extractor = VASP_DataExtract(vasp_dir=vasp_dir)
    extractor.get_atom_and_position_info()
    a = extractor.get_output_as_atom3Dspace()

    if len(a.atoms_pos_info) <= 4:  # too few samples; this can easily happen
        print_file("Not enough samples for %s, which has %s." %
                   (vasp_dir, len(a.atoms_pos_info)))
        del self.total_info[vasp_dir]
        return

    print_file("vasp_dir %s has %s samples" %
               (vasp_dir, len(a.atoms_pos_info)))

    self.total_info[vasp_dir]["generated"] = 1
    # x and y here are not coordinates: x holds the position data, y the energies
    self.total_info[vasp_dir]['x'], self.total_info[vasp_dir][
        'y'], atom_cases = a.generate_data()
    self.atom_cases = self.atom_cases.union(atom_cases)
    print("AtomCases", self.atom_cases)

    self.total_info[vasp_dir]['atom_cases'] = self.atom_cases
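# For reference, after make_dataset() each surviving entry of self.total_info
# is expected to look roughly like the sketch below (the keys come from the
# assignments above; the value descriptions are assumptions for illustration):
#
# self.total_info["/path/to/vasp_dir"] = {
#     "generated": 1,
#     "x": ...,                       # per-sample position data from generate_data()
#     "y": ...,                       # per-sample energies
#     "atom_cases": {"C", "H", "O"},  # running union over all dirs processed so far
# }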
def prepare_data_set(vasp_dir_path):
    aim_vasp_path = vasp_dir_path  # e.g. "S:\数据集\碳纳米管掺杂\\5-5\\b\oh"

    print_file(
        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>New Game Begin!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    print_file("Start Data Collecting")

    dataset_maker = DatasetMaker(aim_vasp_path)
    dataset_maker.make_dataset()
    total_info = dataset_maker.give_out_dataset()
    print_file("Finished Data Collecting, Start Feature Transform")

    # Coordinate encoding
    dataset_offer = DatasetOffer(total_data_info=total_info)
    total_train_feed_x, \
    total_test_feed_x, \
    total_train_feed_y, \
    total_test_feed_y, \
    atom_cases, \
    n_feat = \
        dataset_offer.ANI_transform(save_pkl_path="ANI_features.pkl")

    # The pkl written above is enough for load_pkl_file_to_train(); the feeds
    # are also returned for direct use.
    return (total_train_feed_x, total_test_feed_x,
            total_train_feed_y, total_test_feed_y,
            atom_cases, n_feat)
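# Minimal usage sketch (the path is a placeholder, not taken from the original
# code): this runs extraction plus ANI featurisation and leaves
# "ANI_features.pkl" on disk for load_pkl_file_to_train() below.
#
# if __name__ == "__main__":
#     prepare_data_set(r"path/to/vasp_dataset")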
def load_pkl_file_to_train(ANI_pkl_file_path="ANI_features.pkl", epoch=100):
    with open(ANI_pkl_file_path, "rb") as f:
        total_train_feed_x, total_test_feed_x, total_train_feed_y, total_test_feed_y, atom_cases, n_feat \
            = pickle.load(f)
    nn = FullAtomModel(atom_cases, os.getcwd() + "/model", n_feat)
    try:
        nn.load_atom_weights()
    except Exception:
        print_file("Load Weights Failed")

    string = 'Total feed X shapes:\n'
    string += "Train: "

    for i in total_train_feed_x:
        string += "\n"
        for j in i:
            string += j + ":" + str(i[j].shape)
    string += "\n"

    string += "Test: "
    for i in total_test_feed_x:
        string += "\n"
        for j in i:
            string += j + ":" + str(i[j].shape)
    string += "\n"
    print_file(string)

    def train_group_by_dataset_from():
        # Split by data origin (one VASP dir per dataset); this tends to
        # overfit during training.
        for i in range(epoch):  # "while True" would also work: every fit() saves weights
            print_file(">>Loop %s" % (i + 1))

            for dataset_index in range(len(total_train_feed_x)):
                print_file(">>>>Train for %s/%s" %
                           (dataset_index + 1, len(total_train_feed_x)))
                nn.fit(total_train_feed_x[dataset_index],
                       total_train_feed_y[dataset_index],
                       epoch=1000,
                       load_weights=True)

    def train_shuffle_by_batch_size(batch_size=32):
        X = {}
        y = np.array([])
        # Any one dataset can supply the keys (element symbols)
        atom_cases = list(total_train_feed_x[0].keys())
        print(atom_cases)
        for dataset_index in range(len(total_train_feed_x)):
            print("Dataset has %s samples" %
                  len(total_train_feed_y[dataset_index]))
            # total_train_feed_x is a list; each entry is a dict keyed by
            # element (O, C, H, ...) whose value is shaped
            # (n_samples, n_atoms, n_features), so each element has to be
            # sliced into batches separately.

            # Unpack and aggregate the data. This needs a lot of memory; with
            # many data sources the handling should be revised. It also assumes
            # identical atom counts across samples; otherwise this approach
            # needs rework.
            for atom in atom_cases:
                try:
                    X[atom] = np.concatenate(
                        [total_train_feed_x[dataset_index][atom], X[atom]],
                        axis=0)
                except KeyError:
                    X[atom] = total_train_feed_x[dataset_index][atom]

            y = np.concatenate(
                [y, total_train_feed_y[dataset_index].reshape(-1)])

        print("Y: ", y.shape)
        print("X: ")
        for atom in atom_cases:
            # Check against the per-dataset sample counts printed above to make
            # sure nothing was dropped.
            print(X[atom].shape)

        # Shuffle (np.random.shuffle permutes in place and returns None)
        shuffle_index = np.arange(y.shape[0])
        np.random.shuffle(shuffle_index)

        for atom in atom_cases:
            X[atom] = X[atom][shuffle_index]
        print("Before shuffle: ", y[:10])
        y = y[shuffle_index]
        print("After shuffle: ", y[:10])
        sample_numbers = y.shape[0]

        # Re-batch, one batch_size chunk at a time (ceil division; the
        # remainder forms the final, possibly smaller, batch)
        batch_num = (sample_numbers + batch_size - 1) // batch_size
        newX = {}
        newY = {}
        # Note the data structure changes here: it was a list, now it is a dict
        for i in range(batch_num - 1):
            newX[i] = {}
            for atom in atom_cases:
                newX[i][atom] = X[atom][i * batch_size:(i + 1) * batch_size]
            newY[i] = y[i * batch_size:(i + 1) * batch_size]
        # The final, possibly partial, batch
        newX[batch_num - 1] = {
            atom: X[atom][(batch_num - 1) * batch_size:]
            for atom in atom_cases
        }
        newY[batch_num - 1] = y[(batch_num - 1) * batch_size:]

        # From here training proceeds as before
        X = newX
        y = newY

        index = 0
        while True:  # every fit() saves weights, so this can run indefinitely
            index += 1
            print_file(">>Loop %s" % index)
            for dataset_index in X:
                nn.fit(X[dataset_index],
                       y[dataset_index],
                       epoch=1000,
                       load_weights=True)

    train_group_by_dataset_from()
    #train_shuffle_by_batch_size()

    nn.save_atom_weights()
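# Self-contained sanity check for the re-batching scheme used in
# train_shuffle_by_batch_size above (pure numpy; the shapes are made up for
# illustration and are not from the original data):
def _demo_rebatching():
    X = {"C": np.arange(10 * 3 * 2).reshape(10, 3, 2)}  # (samples, atoms, features)
    y = np.arange(10, dtype=float)
    bs = 4
    n_batches = (y.shape[0] + bs - 1) // bs  # ceil division, as above
    batches = [
        ({a: X[a][i * bs:(i + 1) * bs] for a in X}, y[i * bs:(i + 1) * bs])
        for i in range(n_batches)
    ]
    assert sum(len(b[1]) for b in batches) == y.shape[0]  # no samples lost
    return batches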
def load_soapml_feature_to_train():
    '''
    input: dataset X, dataset y

    The subtlety here: the encoding may use C, Pt and N atoms while the encoded
    ones are H and O, i.e. H and O act as centers with C, Pt and N as the
    environment. An AtomNN that by default treats every atom as both center and
    environment is then ambiguous, so mind its range of applicability!

    :return:
    '''
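    # Layout implied by the unpacking below (an assumption for illustration):
    # each entry of x is one dataset's dict keyed by center-atom index, mapping
    # to an array of shape (n_samples, n_env_atoms, feature_num); the matching
    # entry of y holds that dataset's per-sample targets.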

    with open("S:\FTP\数据集\碳纳米管掺杂\\atomNNDataset.pkl", "rb") as f:

        x, y, center_atom_cases, encode_atom_cases, feature_num = pickle.load(
            f)
    print(x, y)
    center_atom_cases = [
        atom_index_trans_reverse[i] for i in center_atom_cases
    ]
    encode_atom_cases = [
        atom_index_trans_reverse[i] for i in encode_atom_cases
    ]

    new_x = []
    for sample in x:
        tmp = {}
        for key in sample:
            #print(atom_index_trans_reverse[key])
            tmp[atom_index_trans_reverse[key]] = sample[key]
        new_x.append(tmp)
    pass
    x = new_x

    def split_dataset(x, y, ratio=0.3):
        trainX = []
        testX = []
        trainY = []
        testY = []
        for sample_index in range(len(x)):
            sample_x = x[sample_index]
            sample_y = y[sample_index]
            # Skip datasets with 10 or fewer samples
            if len(sample_y) <= 10:
                continue
            split_num = int(len(sample_y) * (1 - ratio))
            trainY.append(sample_y[:split_num])
            testY.append(sample_y[split_num:])
            train = {}
            test = {}
            for key in sample_x:
                train[key] = sample_x[key][:split_num, :, :]
                test[key] = sample_x[key][split_num:, :, :]
            trainX.append(train)
            testX.append(test)
        return trainX, testX, trainY, testY

    trainX, testX, trainY, testY = split_dataset(x, y, 0.3)

    # Note: the model is built over center_atom_cases (the center/encoded
    # atoms), not encode_atom_cases
    nn = FullAtomModel(center_atom_cases, os.getcwd() + "/model", feature_num)
    try:
        nn.load_atom_weights()
    except Exception:
        print_file("Load Weights Failed")

    epoch = 100

    for i in range(epoch):  # "while True" would also work: every fit() saves weights
        print_file(">>Loop %s" % (i + 1))

        for dataset_index in range(len(trainX)):
            print_file(">>>>Train for %s/%s" %
                       (dataset_index + 1, len(trainX)))
            nn.fit(trainX[dataset_index],
                   trainY[dataset_index],
                   epoch=1000,
                   load_weights=True,
                   save_weights=True)
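# Usage sketch (hypothetical; each entry point assumes its pickle file already
# exists on disk):
#
# load_pkl_file_to_train("ANI_features.pkl", epoch=100)
# load_soapml_feature_to_train()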