Example #1
0
    def load_dataset_and_load_vectorizer(cls, surname_csv,
                                         vectorizer_filepath):
        """Build a SurnameDataset from a CSV plus a previously cached vectorizer.

        Use this alternate constructor when the vectorizer has already been
        serialized to disk and should be re-used instead of re-fitted.

        Args:
            surname_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        frame = pd.read_csv(path_merge(surname_csv))
        cached_vectorizer = cls.load_vectorizer_only(path_merge(vectorizer_filepath))
        return cls(frame, cached_vectorizer)
Example #2
0
    def save_vectorizer(self, vectorizer_filepath):
        """Serialize this dataset's vectorizer to disk as JSON.

        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        target_path = path_merge(vectorizer_filepath)
        serializable_state = self._vectorizer.to_serializable()
        with open(target_path, "w") as fp:
            json.dump(serializable_state, fp)
Example #3
0
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """Build a SurnameDataset, fitting a fresh vectorizer on the train split.

        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        frame = pd.read_csv(path_merge(surname_csv))
        # Fit only on the training rows to avoid leaking val/test vocabulary.
        train_rows = frame[frame.split == 'train']
        vectorizer = SurnameVectorizer.from_dataframe(train_rows)
        return cls(frame, vectorizer)
Example #4
0
import torch.optim as optim
from Utils.dataset import SurnameDataset,generate_batches
from NeuralNetwork.Net import SurnameClassifier
from Utils.tools import set_seed_everywhere,handle_dirs,path_merge
from Utils.judge_state import compute_accuracy,update_train_state,make_train_state
from tqdm import tqdm_notebook

# Force CPU mode when no CUDA device is present, regardless of the configured flag.
if not torch.cuda.is_available():
    args.cuda = False

# Single device object used for moving tensors/models later in the script.
args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

# NOTE(review): path_merge presumably prefixes these paths with a project
# base/save directory — confirm against Utils.tools.path_merge.
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = path_merge(args.vectorizer_file)

    args.model_state_file = path_merge(args.model_state_file)

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(path_merge(args.save_dir))


if args.reload_from_files and os.path.exists(args.vectorizer_file):
    # training from a checkpoint
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv,
                                                              args.vectorizer_file)
else:
# @Describe:
import torch
from Utils.dataset import SurnameDataset
from Config.config import args
from NeuralNetwork.Net import SurnameClassifier
from Utils.tools import path_merge

# Rebuild the dataset together with the vectorizer that was cached during training,
# so the vocabulary indices match the saved model weights.
dataset = SurnameDataset.load_dataset_and_load_vectorizer(
    args.surname_csv, args.vectorizer_file)
vectorizer = dataset.get_vectorizer()
# Reconstruct the classifier with dimensions derived from the loaded vectorizer;
# these must match the architecture the checkpoint was trained with.
classifier = SurnameClassifier(embedding_size=args.char_embedding_size,
                               num_embeddings=len(vectorizer.char_vocab),
                               num_classes=len(vectorizer.nationality_vocab),
                               rnn_hidden_size=args.rnn_hidden_size,
                               padding_idx=vectorizer.char_vocab.mask_index)
# Restore the trained weights from the saved state-dict file.
classifier.load_state_dict(torch.load(path_merge(args.model_state_file)))


def predict_nationality(surname, classifier, vectorizer):
    vectorized_surname, vec_length = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(dim=0)
    vec_length = torch.tensor([vec_length], dtype=torch.int64)

    result = classifier(vectorized_surname, vec_length, apply_softmax=True)
    probability_values, indices = result.max(dim=1)

    index = indices.item()
    prob_value = probability_values.item()

    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)