def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
    """Load the dataset and a previously saved vectorizer.

    Used in the case the vectorizer has been cached for re-use, so the
    vocabulary indices match the ones used at training time.

    Args:
        surname_csv (str): location of the dataset
        vectorizer_filepath (str): location of the saved vectorizer
    Returns:
        an instance of SurnameDataset
    """
    surname_df = pd.read_csv(path_merge(surname_csv))
    vectorizer = cls.load_vectorizer_only(path_merge(vectorizer_filepath))
    return cls(surname_df, vectorizer)
def save_vectorizer(self, vectorizer_filepath):
    """Persist this dataset's vectorizer to disk as JSON.

    Args:
        vectorizer_filepath (str): the location to save the vectorizer
    """
    target_path = path_merge(vectorizer_filepath)
    serializable = self._vectorizer.to_serializable()
    with open(target_path, "w") as fp:
        json.dump(serializable, fp)
def load_dataset_and_make_vectorizer(cls, surname_csv):
    """Load the dataset and build a brand-new vectorizer from it.

    The vectorizer is fit on the 'train' split only, so evaluation splits
    cannot leak vocabulary into it.

    Args:
        surname_csv (str): location of the dataset
    Returns:
        an instance of SurnameDataset
    """
    full_df = pd.read_csv(path_merge(surname_csv))
    train_only_df = full_df[full_df.split == 'train']
    fresh_vectorizer = SurnameVectorizer.from_dataframe(train_only_df)
    return cls(full_df, fresh_vectorizer)
import torch.optim as optim from Utils.dataset import SurnameDataset,generate_batches from NeuralNetwork.Net import SurnameClassifier from Utils.tools import set_seed_everywhere,handle_dirs,path_merge from Utils.judge_state import compute_accuracy,update_train_state,make_train_state from tqdm import tqdm_notebook if not torch.cuda.is_available(): args.cuda = False args.device = torch.device("cuda" if args.cuda else "cpu") print("Using CUDA: {}".format(args.cuda)) if args.expand_filepaths_to_save_dir: args.vectorizer_file = path_merge(args.vectorizer_file) args.model_state_file = path_merge(args.model_state_file) # Set seed for reproducibility set_seed_everywhere(args.seed, args.cuda) # handle dirs handle_dirs(path_merge(args.save_dir)) if args.reload_from_files and os.path.exists(args.vectorizer_file): # training from a checkpoint dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv, args.vectorizer_file) else:
# @Describe: import torch from Utils.dataset import SurnameDataset from Config.config import args from NeuralNetwork.Net import SurnameClassifier from Utils.tools import path_merge dataset = SurnameDataset.load_dataset_and_load_vectorizer( args.surname_csv, args.vectorizer_file) vectorizer = dataset.get_vectorizer() classifier = SurnameClassifier(embedding_size=args.char_embedding_size, num_embeddings=len(vectorizer.char_vocab), num_classes=len(vectorizer.nationality_vocab), rnn_hidden_size=args.rnn_hidden_size, padding_idx=vectorizer.char_vocab.mask_index) classifier.load_state_dict(torch.load(path_merge(args.model_state_file))) def predict_nationality(surname, classifier, vectorizer): vectorized_surname, vec_length = vectorizer.vectorize(surname) vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(dim=0) vec_length = torch.tensor([vec_length], dtype=torch.int64) result = classifier(vectorized_surname, vec_length, apply_softmax=True) probability_values, indices = result.max(dim=1) index = indices.item() prob_value = probability_values.item() predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)