def create_vocab(list_of_dataset_dicts: List[Dict[str, Iterable[Instance]]]) -> Vocabulary:
    """Create a combined vocab from all the datasets."""
    non_padded_namespaces = ("tag_namespace", "*pos", "*chunk", "*ner", "*ccg")
    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
    for dataset_dict in list_of_dataset_dicts:
        for dataset in dataset_dict.values():
            params = Params({"non_padded_namespaces": non_padded_namespaces})
            vocab.extend_from_instances(params, dataset)
    return vocab
def create_classification_tagging_vocab(
        list_of_dataset_dicts: List[Dict[str, Iterable[Instance]]]) -> Vocabulary:
    """Create a combined vocab from all the datasets."""
    non_padded_namespaces = ("tag_namespace", "*pos", "*chunk", "*ner", "*ccg",
                             "label_namespace", "*sentiment", "*abusive",
                             "*uncertainity")
    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
    for dataset_dict in list_of_dataset_dicts:
        for dataset in dataset_dict.values():
            params = Params({"non_padded_namespaces": non_padded_namespaces})
            vocab.extend_from_instances(params, dataset)
    return vocab
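# Usage sketch (assumption, not part of the original code): both helpers above expect a
# list of {split_name: instances} dicts, one dict per dataset. The reader arguments and
# file paths below are hypothetical placeholders; create_vocab works the same way for
# tagging-only datasets.
def example_build_combined_vocab(pos_reader, sentiment_reader) -> Vocabulary:
    pos_datasets = {"train": pos_reader.read("data/pos/train.txt"),
                    "dev": pos_reader.read("data/pos/dev.txt")}
    sentiment_datasets = {"train": sentiment_reader.read("data/sentiment/train.txt")}
    return create_classification_tagging_vocab([pos_datasets, sentiment_datasets])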
def create_or_extend_vocab(
    params: Params,
    datasets: Dict[str, Dict[str, Iterable[Instance]]],
    vocabulary_params: Params,
    vocabulary_path: str,
    vocab: Vocabulary = None,
    recover: bool = False,
) -> Vocabulary:
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", datasets))
    for key in datasets_for_vocab_creation:
        if key not in datasets:
            raise ConfigurationError(
                f"invalid 'datasets_for_vocab_creation' key: {key}")

    datasets = {
        key: dataset
        for key, dataset in datasets.items()
        if key in datasets_for_vocab_creation
    }
    flat_datasets = training_util.as_flat_dict(datasets)
    instance_generator = (instance
                          for key, dataset in flat_datasets.items()
                          for instance in dataset)
    dataset_keys_to_use_str = ", ".join(datasets_for_vocab_creation)

    if vocab:
        logger.info(
            f"Extending model vocabulary using {dataset_keys_to_use_str} data.")
        vocab.extend_from_instances(instances=instance_generator)
    else:
        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            dataset_keys_to_use_str,
        )
        if recover and os.path.exists(vocabulary_path):
            vocab = Vocabulary.from_files(
                vocabulary_path,
                vocabulary_params.get("padding_token", None),
                vocabulary_params.get("oov_token", None),
            )
        else:
            # Using a generator comprehension here is important because, by being lazy,
            # it allows us to not iterate over the dataset when directory_path is specified.
            vocab = Vocabulary.from_params(vocabulary_params,
                                           instances=instance_generator)
    return vocab
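# Usage sketch (assumption, not part of the original code): one way create_or_extend_vocab
# might be driven. The Params contents, the "train" key, and the vocabulary directory are
# hypothetical placeholders; pass an existing Vocabulary via `vocab` to extend it instead
# of creating a new one.
def example_create_or_extend(datasets: Dict[str, Dict[str, Iterable[Instance]]]) -> Vocabulary:
    params = Params({"datasets_for_vocab_creation": ["train"]})
    vocabulary_params = Params({"min_count": {"tokens": 1}})
    return create_or_extend_vocab(
        params=params,
        datasets=datasets,
        vocabulary_params=vocabulary_params,
        vocabulary_path="serialization_dir/vocabulary",
        vocab=None,
        recover=False,
    )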
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")

    # Load the pre-built vocabulary and extend it with tokens from this fold's training data.
    vocab = Vocabulary.from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)

    # Re-read the data with the extended vocabulary attached to the reader.
    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    # embedding_out is only needed by the negative-sampling variant commented out below.
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)
    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # Save the trained input embeddings, then inspect nearest neighbours for a few query tokens.
    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
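# Hedged sketch (assumption): get_synonyms is not defined in this snippet. A common
# implementation ranks vocabulary entries by cosine similarity to the query token in the
# trained input-embedding space. The 'token_in' namespace matches the one used above;
# everything else here is illustrative, not the original helper.
from collections import Counter

def get_synonyms(token: str, embedding: Embedding, vocab: Vocabulary, num_synonyms: int = 10):
    """Return the num_synonyms tokens closest to `token` by cosine similarity."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = torch.nn.CosineSimilarity(dim=0)
    sims = Counter()
    for index, other_token in vocab.get_index_to_token_vocabulary('token_in').items():
        sims[other_token] = cosine(token_vec, embedding.weight[index]).item()
    return sims.most_common(num_synonyms)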