def process(self, X):
    """Convert raw texts into the word-level and character-level inputs.

    The texts in ``X`` are first cut into chunks using the model's
    ``split_threshold``. The chunks are then filtered, normalized and
    encoded with the normalizer, word tokenizer and ``MAX_TEXT_WORDS``
    setting stored on ``self.model``.

    Returns:
        A tuple ``(text_word, text_char)``.

    NOTE(review): the character-level step is elided (``...``) in this
    source, so ``text_char`` is not assigned by the visible code.
    """
    model = self.model

    # The network cannot be fed an overly long input, so cut the texts
    # into chunks first.
    text_split = split_long_texts(X, model.split_threshold)
    # From here on the index is the number of the source work, and it is
    # duplicated across the chunks.

    # Word-level preprocessing.
    # Pull the parameters and the fitted helpers out of the model.
    normalizer = model.normalizer
    tokenizer = model.word_tokenizer
    max_words = model.params['MAX_TEXT_WORDS']

    # Preprocess the input.
    filtered_data = filter_chars(text_split)
    normalized = normalizer.normalize(filtered_data)
    text_word = preprocessing(
        normalized,
        encode=tokenizer.texts_to_sequences,
        inputlen=max_words,
    )

    # Character-level preprocessing (elided in the source).
    ...

    return text_word, text_char
"--train", action="store_true", help="Run the training of the model") parser.add_argument( "-p", "--preprocess", action="store_true", help= "Update the train and test csv files with the new images in dataset, used this if you added new images in dataset" ) args = parser.parse_args() if args.preprocess: print("Preprocessing..") preprocessing() print("Preprocessing finished!") cuda_available = torch.cuda.is_available() # directory results if not os.path.exists(RESULTS_PATH): os.makedirs(RESULTS_PATH) # Load dataset mean = m std_dev = s transform_train = transforms.Compose([ transforms.RandomApply([transforms.ColorJitter(0.1, 0.1, 0.1, 0.1)], p=0.5),
# Training-script setup: choose the visible GPUs, fetch the feature lists and
# data from dataset.preprocessing(), configure a TensorBoard callback, and
# build the per-feature training inputs.
import tensorflow as tf
import config as cfg
import dataset
from model import build_model
from tensorflow.keras.callbacks import TensorBoard
import os

# Restrict the CUDA devices visible to this process. This runs after
# `import tensorflow` but before the first GPU query on the next line.
# NOTE(review): the value has a space after each comma -- confirm the CUDA
# runtime parses it as devices 4-7.
os.environ['CUDA_VISIBLE_DEVICES'] = '4, 5, 6, 7'
# Not referenced again in the visible part of the script.
physical_devices = tf.config.experimental.list_physical_devices('GPU')

# dense_features / sparse_features are iterated below and used as keys into
# the data; total_data supports .sample and .iloc (presumably a pandas
# DataFrame -- confirm in dataset.py).
dense_features, sparse_features, total_data = dataset.preprocessing()

# TensorBoard logging into ./logs, updated every batch.
# NOTE(review): write_grads and embeddings_layer_names are not accepted by
# every tf.keras TensorBoard version -- confirm against the installed
# TensorFlow release.
tbCallBack = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_grads=True,
                         write_images=True,
                         embeddings_freq=0,
                         update_freq='batch',
                         embeddings_layer_names=None,
                         embeddings_metadata=None)

# Shuffle all rows with a fixed seed so the split below is reproducible.
total_data = total_data.sample(frac=1.0, random_state=1)
# First 500000 shuffled rows are used for training, the remainder for
# validation (val_data is empty if there are at most 500000 rows).
train_data = total_data.iloc[:500000]
val_data = total_data.iloc[500000:]

# One array of values per feature, in the order of the feature lists.
train_dense = [train_data[f].values for f in dense_features]
train_sparse = [train_data[f].values for f in sparse_features]