def test_extend_from_vocab(self):
    """Check that ``extend_from_vocab`` merges namespaces, non-padded sets and tokens."""
    target = Vocabulary(non_padded_namespaces={"1", "2"})
    source = Vocabulary(non_padded_namespaces={"3"})

    # (vocabulary, tokens, namespace) triples, applied in exactly this order
    # because token indices depend on insertion order.
    setup = [
        (target, ["a", "b", "c"], "1"),
        (target, ["d", "e", "f"], "2"),
        (source, ["c", "d", "e"], "1"),
        (source, ["g", "h", "i"], "3"),
    ]
    for vocabulary, tokens, namespace in setup:
        vocabulary.add_tokens_to_namespace(tokens, namespace=namespace)

    target.extend_from_vocab(source)

    every_namespace = {"1", "2", "3"}
    assert target.get_namespaces() == every_namespace
    assert target._non_padded_namespaces == every_namespace

    # Namespace "1" exists in both vocabularies; the expected mapping below
    # includes the padding/unknown entries ahead of the newly merged tokens.
    expected_first = {
        "a": 0,
        "b": 1,
        "c": 2,
        "@@PADDING@@": 3,
        "@@UNKNOWN@@": 4,
        "d": 5,
        "e": 6,
    }
    # Namespace "2" only exists in the target and must be left untouched.
    expected_second = {"d": 0, "e": 1, "f": 2}
    # Namespace "3" only exists in the source and must be copied over.
    expected_third = {"g": 0, "h": 1, "i": 2}

    assert target.get_token_to_index_vocabulary("1") == expected_first
    assert target.get_token_to_index_vocabulary("2") == expected_second
    assert target.get_token_to_index_vocabulary("3") == expected_third
def __init__(self,
             vocab: Vocabulary,
             openai_model_path: str,
             n_ctx: int = 512,
             tokens_to_add: List[str] = None,
             requires_grad: bool = True,
             clf_token: str = '__clf__',
             dropout: float = .1,
             entity_dropout: float = 0.0,
             language_model_weight: float = .5,
             selector: str = 'average',
             label_namespace: str = 'labels') -> None:
    """Build an OpenAI-transformer model with a bag classification head and an LM head.

    Parameters
    ----------
    vocab : Vocabulary
        Vocabulary holding the ``'openai_transformer'`` token namespace and
        the label namespace.
    openai_model_path : str
        Passed to ``OpenaiTransformer`` as ``model_path``.
    n_ctx : int
        Passed to ``OpenaiTransformer`` as ``n_ctx``.
    tokens_to_add : List[str], optional
        Extra special tokens; only their count is used here (``n_special``).
        When ``None``, ``n_special`` is ``-1``.
    requires_grad : bool
        Passed to ``OpenaiTransformer``.
    clf_token : str
        Classification token; ``'</w>'`` is appended before it is handed to
        the classification head.
    dropout : float
        Dropout used by the classification head.
    entity_dropout : float
        Stored on the model as ``self.entity_dropout``.
    language_model_weight : float
        Stored on the model as ``self.language_model_weight``.
    selector : str
        Selector strategy forwarded to ``BagClassificationHead``.
    label_namespace : str
        Vocabulary namespace of the labels. It determines both the number of
        classes and the index of the ``'NA'`` label.

    Raises
    ------
    KeyError
        Presumably raised when the label namespace has no ``'NA'`` entry
        (plain mapping lookup).
    """
    super().__init__(vocab)

    n_special = len(tokens_to_add) if tokens_to_add is not None else -1

    transformer = OpenaiTransformer(model_path=openai_model_path,
                                    n_special=n_special,
                                    requires_grad=requires_grad,
                                    n_ctx=n_ctx)

    self.embedder = OpenaiTransformerEmbedder(transformer=transformer,
                                              top_layer_only=True)

    # Look the encoder vocabulary up once; the head and the model share it.
    encoder_vocab = vocab.get_token_to_index_vocabulary('openai_transformer')

    self.clf_head = BagClassificationHead(
        model=transformer,
        encoder_vocab=encoder_vocab,
        n_class=vocab.get_vocab_size(label_namespace),
        clf_token=clf_token + '</w>',
        selector=selector,
        dropout=dropout)

    self.lm_head = LanguageModelHead(transformer)

    self.language_model_weight = language_model_weight
    self.entity_dropout = entity_dropout
    self.encoder_vocab = encoder_vocab

    self.del1_token = '__del1__</w>'
    self.del2_token = '__del2__</w>'
    self.mask_token = '__mask__</w>'

    # Use the configured label namespace (previously hard-coded to 'labels'),
    # so the 'NA' index always refers to the namespace that sizes the head.
    self.na_idx = self.vocab.get_token_to_index_vocabulary(label_namespace)['NA']

    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "not_na_accuracy": CategoricalAccuracy()
    }
def generate_neighbours(vocab, file_name, measure='euc', topk=8, rho=0.6):
    """Compute embedding-space neighbours for vocabulary tokens and dump them as JSON.

    :param vocab: Vocabulary to use. When ``None`` one is built from the
        pretrained tokens of ``WORD2VECS['counter']``.
    :param file_name: Name of the output file inside ``external_data/``.
        When ``None`` nothing is written; instead 100 tokens are sampled
        (with replacement) and only the statistics are printed.
    :param measure: Distance measure forwarded to ``EmbeddingNbrUtil``.
    :param topk: Number of neighbours wanted per token. ``topk + 1`` are
        requested because the token itself is removed from the result when
        it appears there.
    :param rho: Threshold forwarded to ``find_neighbours``. When ``None``
        ``pre_search`` is called once up front.
    :returns: ``None``.
    """
    if vocab is None:
        tokens = _read_pretrained_tokens(WORD2VECS['counter'])
        vocab = Vocabulary(tokens_to_add={"tokens": tokens})

    embed = read_weight(vocab, "counter", None)
    emb_util = EmbeddingNbrUtil(embed, vocab.get_token_index,
                                vocab.get_token_from_index)
    if rho is None:
        emb_util.pre_search(measure, topk + 1, None)

    nbr_num = []
    ret = {}
    tokens = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    if file_name is None:
        tokens = random.choices(tokens, k=100)

    for ele in tqdm(tokens):
        nbrs = emb_util.find_neighbours(ele, measure, topk + 1, rho,
                                        return_words=True)
        if ele in nbrs:
            nbrs.remove(ele)
        ret[ele] = nbrs
        nbr_num.append(len(nbrs))

    print(nbr_num)
    print('Average neighbour num:', np.mean(nbr_num))

    if file_name is None:
        return

    # Close (and therefore flush) the output deterministically instead of
    # relying on garbage collection of an anonymous file object.
    with open(f"external_data/{file_name}", "w") as out_file:
        json.dump(ret, out_file)
def plot_weight(weight_path):
    """Render a saved weight matrix as a confusion-matrix style figure.

    The object loaded from ``weight_path`` is converted with ``.numpy()``
    and plotted with the tokens of the default namespace of the vocabulary
    stored in ``data/vocabulary`` as labels on both axes. The figure goes
    into ``figures/`` and is named after the weight file without extension.

    :param weight_path: Path to a file readable by ``torch.load``.
    """
    weight = torch.load(weight_path).numpy()

    vocab = Vocabulary().from_files("data/vocabulary")
    xlabels = ylabels = list(vocab.get_token_to_index_vocabulary())

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs("figures", exist_ok=True)

    fn = os.path.splitext(os.path.basename(weight_path))[0]
    save_confusion_matrix_figure("figures", fn, weight, xlabels, ylabels)
def load_neighbour_words(
        vocab: Vocabulary,
        file_name='external_data/counter_fitted_neighbors.json'):
    """Load a word -> neighbours mapping restricted to the vocabulary.

    :param vocab: Vocabulary whose default namespace defines the known tokens.
    :param file_name: JSON file mapping each word to a list of neighbours.
    :returns: A ``defaultdict(list)``. Words from the file that are in the
        vocabulary map to those of their neighbours that are also in the
        vocabulary; every other key yields an empty list.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(file_name, encoding="utf-8") as nbr_file:
        nbr_dct = json.load(nbr_file)

    tokens = vocab.get_token_to_index_vocabulary()

    # ``list`` as factory (instead of a lambda) keeps the result picklable.
    ret = defaultdict(list)
    for word, neighbours in nbr_dct.items():
        if word in tokens:
            ret[word] = [nbr for nbr in neighbours if nbr in tokens]
    return ret
def __init__(self,
             vocab: Vocabulary,
             ngram_orders: Union[int, List[int]],
             max_sentences: Optional[int] = None,
             max_words: Optional[int] = None,
             max_bytes: Optional[int] = None,
             use_porter_stemmer: bool = True,
             remove_stopwords: bool = False,
             namespace: str = 'tokens') -> None:
    """Store the ROUGE configuration and resolve the special-token indices.

    ``ngram_orders`` may be a single int, which is wrapped into a
    one-element list. Each special-token index attribute is ``None`` when
    the corresponding symbol is absent from the ``namespace`` vocabulary.
    """
    super().__init__()

    self.ngram_orders = [ngram_orders] if isinstance(ngram_orders, int) else ngram_orders
    self.max_sentences = max_sentences
    self.max_words = max_words
    self.max_bytes = max_bytes
    self.use_porter_stemmer = use_porter_stemmer
    self.remove_stopwords = remove_stopwords
    self.python_rouge = PythonRouge()
    self.vocab = vocab
    self.namespace = namespace

    token_to_index = vocab.get_token_to_index_vocabulary(namespace)

    # Read the special tokens straight from the mapping rather than through
    # the vocabulary's lookup: a missing symbol must become None, not the
    # OOV index, because OOV must not be skipped when turning indices back
    # into strings.
    self.start_index = token_to_index.get(START_SYMBOL)
    self.end_index = token_to_index.get(END_SYMBOL)
    self.pad_index = token_to_index.get(DEFAULT_PADDING_TOKEN)
    self.sent_start_index = token_to_index.get(SENT_START_SYMBOL)
    self.sent_end_index = token_to_index.get(SENT_END_SYMBOL)

    # Running state of the metric.
    self.count = 0
    self.totals = {}
def __init__(
    self,
    vocab: Vocabulary,
    vocab_namespace: str = "tokens",
    projection_dim: int = None,
    ignore_oov: bool = False,
) -> None:
    """Set up a bag-of-word-counts embedder over ``vocab_namespace``.

    A linear projection from vocabulary size to ``projection_dim`` is
    created only when ``projection_dim`` is truthy; otherwise the output
    dimension is the vocabulary size.

    Raises ``ConfigurationError`` when the vocabulary's OOV token is not
    present in ``vocab_namespace``.
    """
    super().__init__()
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size(vocab_namespace)

    self._projection = (
        torch.nn.Linear(self.vocab_size, projection_dim) if projection_dim else None
    )
    self._ignore_oov = ignore_oov

    namespace_tokens = vocab.get_token_to_index_vocabulary(vocab_namespace)
    self._oov_idx = namespace_tokens.get(vocab._oov_token)
    if self._oov_idx is None:
        raise ConfigurationError(
            "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
        )

    self.output_dim = projection_dim or self.vocab_size
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             dropout_p: float,
             vocab: Vocabulary) -> None:
    """Build the tagger: embeddings -> projection -> encoder -> two-layer tag head.

    Parameters
    ----------
    word_embeddings : TextFieldEmbedder
        Embeds the input text field.
    encoder : Seq2SeqEncoder
        Sequence encoder; its input/output dimensions size the surrounding
        feed-forward layers.
    dropout_p : float
        Dropout value forwarded to both ``FeedForward`` layers.
    vocab : Vocabulary
        Must contain a ``'labels'`` namespace that includes the labels
        ``'O'`` and ``'OR'``.

    Raises
    ------
    KeyError
        If ``'O'`` or ``'OR'`` is missing from the ``'labels'`` namespace.
    """
    super().__init__(vocab)
    self.word_embeddings = word_embeddings

    # Map embedding size to whatever the encoder expects as input.
    self.embedding2input = FeedForward(
        input_dim=word_embeddings.get_output_dim(),
        num_layers=1,
        hidden_dims=encoder.get_input_dim(),
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)

    self.encoder = encoder

    # Halve the encoder output before the final tag projection.
    self.hidden2intermediate = FeedForward(
        input_dim=encoder.get_output_dim(),
        num_layers=1,
        hidden_dims=int(encoder.get_output_dim() / 2),
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)

    self.intermediate2tag = nn.Linear(
        in_features=int(encoder.get_output_dim() / 2),
        out_features=vocab.get_vocab_size('labels'))

    # The metric is computed over every label except 'O' and 'OR'. Work on
    # a copy so the vocabulary's own mapping is not modified.
    label_vocab = vocab.get_token_to_index_vocabulary('labels').copy()
    for excluded_label in ('O', 'OR'):
        label_vocab.pop(excluded_label)
    labels_for_metric = list(label_vocab.values())

    self.accuracy = CustomFBetaMeasure(beta=1.0,
                                       average='micro',
                                       labels=labels_for_metric)
def read_dataset(dataset_fp: Path, incl_labels: bool,
                 vocab: Vocabulary) -> List[Dict[str, Any]]:
    '''
    Read a JSON-lines dataset, optionally attaching multi-hot label vectors.

    :param dataset_fp: File path to a file with one JSON object per line.
                       Blank lines are skipped.
    :param incl_labels: Whether to add the extra `label_array` key/value
    :param vocab: Vocab of the model that is going to predict on the data
    :returns: The data from the dataset with optionally the extra
              `label_array` key that contains the labels in one hot format
              (a list whose length is the size of the `labels` namespace,
              with 1 at the index of every label of the sample).
    :raises KeyError: If `incl_labels` is set and a sample has no `labels`
                      key or a label that is not in the `labels` namespace.
    '''
    samples = []
    token_to_index = vocab.get_token_to_index_vocabulary(namespace='labels')
    num_labels = vocab.get_vocab_size('labels')

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with dataset_fp.open('r', encoding='utf-8') as dataset_file:
        for line in dataset_file:
            # Tolerate blank lines instead of crashing in json.loads.
            if not line.strip():
                continue
            sample = json.loads(line)
            if incl_labels:
                label_array = [0] * num_labels
                for label in sample['labels']:
                    label_array[token_to_index[label]] = 1
                sample['label_array'] = label_array
            samples.append(sample)
    return samples
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             variational_autoencoder: FeedForward = None,
             sentiment_classifier: FeedForward = None,
             topic_dim: int = 20,
             freeze_feature_extraction: bool = False,
             classification_mode: bool = False,
             pretrained_file: str = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """Construct a TopicRNN, either from scratch or from a pretrained archive.

    Parameters
    ----------
    vocab : Vocabulary
        Model vocabulary. If it has no ``"stopless"`` namespace yet, one is
        created here and the vocabulary is saved to the ``"vocabulary"``
        directory.
    text_field_embedder : TextFieldEmbedder
        Embedder for the input text.
    text_encoder : Seq2SeqEncoder
        Encoder whose output dimension sizes the projection layers.
    variational_autoencoder : FeedForward, optional
        Inference network; a three-layer ReLU ``FeedForward`` over the
        stopless vocabulary is built when omitted.
    sentiment_classifier : FeedForward, optional
        Classifier head; a two-layer sigmoid ``FeedForward`` is built when
        omitted.
    topic_dim : int
        Number of latent topics.
    freeze_feature_extraction : bool
        When set, ``requires_grad`` is turned off for every parameter whose
        name does not contain ``"sentiment_classifier"``.
    classification_mode : bool
        When set, a ``'sentiment'`` accuracy metric is tracked as well.
    pretrained_file : str, optional
        Archive to initialise from; when given, the model is set up through
        ``self._init_from_archive`` instead of the from-scratch branch.
    initializer : InitializerApplicator
        Applied to the model at the end of construction.
    regularizer : RegularizerApplicator, optional
        Forwarded to the base class.
    """
    super(TopicRNN, self).__init__(vocab, regularizer)

    self.metrics = {
        'cross_entropy': Average(),
        'negative_kl_divergence': Average(),
        'stopword_loss': Average()
    }

    self.classification_mode = classification_mode
    if classification_mode:
        self.metrics['sentiment'] = CategoricalAccuracy()

    if pretrained_file:
        # Take everything from the archived model.
        archive = load_archive(pretrained_file)
        pretrained_model = archive.model
        self._init_from_archive(pretrained_model)
    else:
        # Model parameter definition.
        #
        # Defaults reflect Dieng et al.'s decisions when training their semi-unsupervised
        # IMDB sentiment classifier.
        self.text_field_embedder = text_field_embedder
        self.vocab_size = self.vocab.get_vocab_size("tokens")
        self.text_encoder = text_encoder
        self.topic_dim = topic_dim
        self.vocabulary_projection_layer = TimeDistributed(
            Linear(text_encoder.get_output_dim(), self.vocab_size))

        # Parameter gamma from the paper; projects hidden states into binary logits for whether a
        # word is a stopword.
        self.stopword_projection_layer = TimeDistributed(
            Linear(text_encoder.get_output_dim(), 2))

        # Token -> index mapping of the vocabulary's default namespace
        # (presumably "tokens" — confirm against Vocabulary's default).
        self.tokens_to_index = vocab.get_token_to_index_vocabulary()

        # This step should only ever be performed ONCE.
        # When running allennlp train, the vocabulary will be constructed before the model instantiation, but
        # we can't create the stopless namespace until we get here.
        # Check if there already exists a stopless namespace: if so refrain from altering it.
        if "stopless" not in vocab._token_to_index.keys():
            # Sanity check: padding and OOV must sit at indices 0 and 1.
            assert self.tokens_to_index[DEFAULT_PADDING_TOKEN] == 0 and \
                   self.tokens_to_index[DEFAULT_OOV_TOKEN] == 1

            # Copy every non-stopword token into the "stopless" namespace.
            for token, _ in self.tokens_to_index.items():
                if token not in STOP_WORDS:
                    vocab.add_token_to_namespace(token, "stopless")

            # Since a vocabulary with the stopless namespace hasn't been saved, save one for convenience.
            vocab.save_to_files("vocabulary")

        # Compute stop indices in the normal vocab space to prevent stop words
        # from contributing to the topic additions.
        self.stop_indices = torch.LongTensor(
            [vocab.get_token_index(stop) for stop in STOP_WORDS])

        # Learnable topics.
        # TODO: How should these be initialized?
        self.beta = nn.Parameter(torch.rand(topic_dim, self.vocab_size))

        # mu: The mean of the variational distribution.
        self.mu_linear = nn.Linear(topic_dim, topic_dim)

        # sigma: The root standard deviation of the variational distribution.
        self.sigma_linear = nn.Linear(topic_dim, topic_dim)

        # noise: used when sampling.
        self.noise = MultivariateNormal(torch.zeros(topic_dim), torch.eye(topic_dim))

        stopless_dim = vocab.get_vocab_size("stopless")
        self.variational_autoencoder = variational_autoencoder or FeedForward(
            # Takes as input the word frequencies in the stopless dimension and projects
            # the word frequencies into a latent topic representation.
            #
            # Each latent representation will help tune the variational dist.'s parameters.
            stopless_dim,
            3,
            [500, 500, topic_dim],
            torch.nn.ReLU(),
        )

        # The shape for the feature vector for sentiment classification.
        # (RNN Hidden Size + Inference Network output dimension).
        sentiment_input_size = text_encoder.get_output_dim() + topic_dim
        self.sentiment_classifier = sentiment_classifier or FeedForward(
            # As done by the paper; a simple single layer with 50 hidden units
            # and sigmoid activation for sentiment classification.
            sentiment_input_size,
            2,
            [50, 2],
            torch.nn.Sigmoid(),
        )

    # NOTE(review): the statements below are assumed to run for both the
    # pretrained and the from-scratch branch — confirm the nesting against
    # the original file.
    if freeze_feature_extraction:
        # Freeze the RNN and VAE pipeline so that only the classifier is trained.
        for name, param in self.named_parameters():
            if "sentiment_classifier" not in name:
                param.requires_grad = False

    self.sentiment_criterion = nn.CrossEntropyLoss()

    # Fixed sample count; its use is outside this block.
    self.num_samples = 50

    initializer(self)
class KNNPredictor(Predictor):
    """Predictor that answers queries with nearest-neighbour tracks from an Annoy index."""

    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 vocab_path: str = 'resources/vocab',
                 df_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
                 annoy_index_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/index.tree'
                 ) -> None:
        """Load the vocabulary and track table, and optionally an Annoy index.

        The model is put into eval mode. When ``annoy_index_path`` is falsy
        no index is loaded and ``self.index`` stays ``None``.
        """
        super().__init__(model.eval(), dataset_reader)

        self.vocab = Vocabulary().from_files(vocab_path)
        # Track metadata, addressable by track id.
        self.df = pd.read_csv(df_path).set_index("track_id")

        self.index = None
        if annoy_index_path:
            self.build_index(annoy_index_path)

    def build_index(self, path: str, tracks: List[Tuple[str, np.array]] = None):
        """Load an index from ``path``, or build one from ``tracks`` and save it there.

        Without ``tracks``, ``path`` is loaded; if it does not exist locally
        it is first fetched with ``urlretrieve``. With ``tracks`` (pairs of
        track id and vector), a new index is built, saved to ``path`` and
        installed as ``self.index``.
        """
        dim = self._model.classifier_feedforward.get_output_dim()

        if tracks is not None:
            fresh_index = AnnoyIndex(dim, metric='angular')
            for track_id, embedding in tqdm(tracks):
                # Items are keyed by the track's index in the "labels" namespace.
                item_id = self.vocab.get_token_to_index_vocabulary("labels")[track_id]
                fresh_index.add_item(item_id, embedding)
            fresh_index.build(-1)
            fresh_index.save(path)
            self.index = fresh_index
            return

        # No tracks given: load a prebuilt index, downloading it if needed.
        if not os.path.exists(path):
            path = urlretrieve(path)[0]
        self.index = AnnoyIndex(dim, metric='angular')
        self.index.load(path)

    def neighbors_to_tracks(self, nns):
        """Turn label indices into a list of track records from ``self.df``."""
        track_ids = [self.vocab.get_token_from_index(label_idx, "labels") for label_idx in nns]
        rows = self.df.loc[track_ids]
        return rows.reset_index(drop=True).to_dict(orient='records')

    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """Return neighbouring tracks for a track id or a free-text query.

        ``'n'`` (default 10) is popped from ``inputs``. With a
        ``'track_id'`` the neighbours of that item are returned, which
        requires an index. Otherwise the model is run on the query; if an
        index is available the neighbours of the predicted ``'logits'`` are
        returned, else the raw model output with the inputs attached.
        """
        n = inputs.pop('n', 10)

        if 'track_id' in inputs:
            if self.index is None:
                raise AttributeError("Please build an index before searching by track.")
            label_vocab = self.vocab.get_token_to_index_vocabulary("labels")
            item_id = label_vocab[inputs['track_id']]
            # One extra neighbour is requested and the first hit dropped —
            # presumably the query track itself.
            neighbours = self.index.get_nns_by_item(item_id, n + 1)[1:]
            return self.neighbors_to_tracks(neighbours)

        output_dict = self.predict_instance(self._json_to_instance(inputs))
        output_dict['inputs'] = inputs

        if not self.index:
            return output_dict

        query_vector = output_dict.get('logits')
        neighbours = self.index.get_nns_by_vector(query_vector, n)
        return self.neighbors_to_tracks(neighbours)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an instance from the ``'query'`` field of the request."""
        return self._dataset_reader.text_to_instance(text=json_dict['query'])
def __init__(self,
             vocab: Vocabulary,
             document_token_embedder: TextFieldEmbedder,
             encoder: RNN,
             attention: MatrixAttention,
             attention_layer: FeedForward,
             decoder: RNN,
             bridge: Bridge,
             beam_search: BeamSearch,
             run_beam_search: bool = True,
             summary_token_embedder: Optional[TokenEmbedder] = None,
             summary_namespace: str = 'tokens',
             use_input_feeding: bool = False,
             input_feeding_projection_layer: Optional[FeedForward] = None,
             instance_loss_normalization: str = 'sum',
             batch_loss_normalization: str = 'average',
             metrics: Optional[List[Metric]] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: RegularizerApplicator = None) -> None:
    """Wire up the encoder/decoder summarisation model.

    When no ``summary_token_embedder`` is given, the document embedder's
    ``'tokens'`` embedder is used for the summary as well. The start, end
    and padding symbols must exist in ``summary_namespace``
    (``AssertionError`` otherwise); the sentence start/end symbols are
    optional and their indices are ``None`` when absent.
    """
    super().__init__(vocab, regularizer)

    # Sub-modules.
    self.document_token_embedder = document_token_embedder
    self.encoder = encoder
    self.attention = attention
    self.attention_layer = attention_layer
    self.decoder = decoder
    self.bridge = bridge
    self.beam_search = beam_search
    self.run_beam_search = run_beam_search
    self.summary_token_embedder = (
        summary_token_embedder or document_token_embedder._token_embedders['tokens']
    )

    # Configuration.
    self.summary_namespace = summary_namespace
    self.use_input_feeding = use_input_feeding
    self.input_feeding_projection_layer = input_feeding_projection_layer
    self.instance_loss_normalization = instance_loss_normalization
    self.batch_loss_normalization = batch_loss_normalization

    # Applied after the attention context and the decoder hidden state have
    # been combined; produces the scores over the summary vocabulary.
    summary_vocab_size = vocab.get_vocab_size(summary_namespace)
    self.output_layer = torch.nn.Linear(decoder.get_output_dim(), summary_vocab_size)

    symbol_to_index = vocab.get_token_to_index_vocabulary(summary_namespace)

    # Mandatory special tokens.
    assert START_SYMBOL in symbol_to_index
    self.start_index = symbol_to_index[START_SYMBOL]
    assert END_SYMBOL in symbol_to_index
    self.end_index = symbol_to_index[END_SYMBOL]
    assert DEFAULT_PADDING_TOKEN in symbol_to_index
    self.pad_index = symbol_to_index[DEFAULT_PADDING_TOKEN]

    # Optional sentence-boundary tokens.
    self.sent_start_index = symbol_to_index.get(SENT_START_SYMBOL)
    self.sent_end_index = symbol_to_index.get(SENT_END_SYMBOL)

    # Per-token loss that ignores padding positions.
    self.loss = torch.nn.CrossEntropyLoss(ignore_index=self.pad_index,
                                          reduction='none')

    # Metrics computed by the model.
    self.metrics = metrics
    self.cross_entropy_metric = CrossEntropyMetric()

    initializer(self)