def from_word_vectors(cls, word_vectors, unique_labels):
    """Instantiate the vectorizer"""
    review_vocab = word_vectors
    rating_vocab = Indexer()
    # Add ratings
    for l in unique_labels:
        rating_vocab.add_and_get_index(l)
    return cls(review_vocab, rating_vocab)
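A minimal usage sketch for this classmethod, assuming it lives on a vectorizer class (ReviewVectorizer is a hypothetical name) and that cls(review_vocab, rating_vocab) stores the two vocabularies as attributes; the embeddings path and label values are illustrative:

# Illustrative only: ReviewVectorizer, the path, and the label set are assumptions.
word_vectors = read_word_embeddings("path/to/relativized_embeddings.txt")
vectorizer = ReviewVectorizer.from_word_vectors(word_vectors,
                                                unique_labels=[0, 1])
# rating_vocab now holds one index per distinct rating label.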
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you want to pad inputs to a fixed length
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx + 1:]
            float_numbers = [
                float(number_str) for number_str in numbers.split()
            ]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the first line
            # of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
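The loader expects one token per line followed by its space-separated vector components. A small self-contained sketch (the file name and toy vectors are made up) that writes such a file and reads it back:

# Illustrative only: a tiny 3-dimensional embedding file in the ASCII format
# read_word_embeddings expects ("word v1 v2 v3" per line).
toy_vocab = {"movie": [0.1, 0.2, 0.3], "great": [0.4, 0.5, 0.6]}
with open("toy_embeddings.txt", "w") as out:
    for word, vec in toy_vocab.items():
        out.write(word + " " + " ".join(str(x) for x in vec) + "\n")

embs = read_word_embeddings("toy_embeddings.txt")
# Positions 0 and 1 are reserved for PAD and UNK (both zero vectors),
# so the vector matrix has len(toy_vocab) + 2 rows.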
class CharTokenizer:
    """
    Class to create char tokens
    """

    def __init__(self, max_word_length):
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(UNK_TOKEN)  # Unknown token is 1
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)
        self.max_word_length = max_word_length

    def convert_words_to_charids(self, words):
        word_charids = []
        for w in words:
            charids = []
            for c in w:
                charids.append(self.char_vocab_index.index_of(c))
            # Truncate to max_word_length, then pad with the PAD id (0)
            charids = charids[:self.max_word_length]
            if len(charids) < self.max_word_length:
                charids.extend([0] * (self.max_word_length - len(charids)))
            word_charids.append(charids)
        return word_charids
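A short usage sketch (the max_word_length value and input words are illustrative); with PAD at index 0 and UNK at index 1, the letter 'a' receives index 2, and each output row is truncated or padded to exactly max_word_length ids:

# Illustrative usage of the CharTokenizer defined above.
char_tokenizer = CharTokenizer(max_word_length=16)
char_ids = char_tokenizer.convert_words_to_charids(["the", "cat"])
# char_ids[1] starts with the ids for 'c', 'a', 't' and is padded with 0s
# (the PAD id) out to length 16.
assert len(char_ids[1]) == 16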
class CharBaselineReader(nn.Module):
    """
    Baseline QA Model
    [Architecture]
        0) Inputs: passages and questions
        1) Embedding Layer: converts words to vectors
        2) Context2Query: computes weighted sum of question embeddings for
           each position in passage.
        3) Passage Encoder: LSTM or GRU.
        4) Question Encoder: LSTM or GRU.
        5) Question Attentive Sum: computes weighted sum of question hidden vectors.
        6) Start Position Pointer: computes scores (logits) over passage
           conditioned on the question vector.
        7) End Position Pointer: computes scores (logits) over passage
           conditioned on the question vector.

    Args:
        args: `argparse` object.

    Inputs:
        batch: a dictionary containing batched tensors.
            {
                'passages': LongTensor [batch_size, p_len],
                'questions': LongTensor [batch_size, q_len],
                'start_positions': Not used in `forward`,
                'end_positions': Not used in `forward`,
            }

    Returns:
        Logits for start positions and logits for end positions.
        Tuple: ([batch_size, p_len], [batch_size, p_len])
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.pad_token_id = args.pad_token_id

        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize char embedding layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_embedding_dim)

        # Char CNN used when args.char_embedding_type != 'average'. Registered
        # here so its weights are trained with the model, rather than being
        # re-created (with fresh random weights) on every forward pass.
        self.char_conv = nn.Conv1d(args.char_embedding_dim,
                                   args.char_embedding_dim, 3)
        self.char_relu = nn.ReLU()

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim,
                                            args.char_embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = (args.hidden_dim * 2
                       if args.bidirectional else args.hidden_dim)

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize char indexer
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """
        if self.args.embedding == 'glove':
            embedding_map = load_cached_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                if word in embedding_map:
                    embeddings[i] = torch.tensor(embedding_map[word])
                    num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)
        else:
            # Load FastText embeddings.
            embedding_map = load_fasttext_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                embeddings[i] = torch.tensor(
                    embedding_map.get_word_vector(word))
                num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained

    def sorted_rnn(self, sequences, sequence_lengths, rnn):
        """
        Sorts and packs inputs, then feeds them into RNN.

        Args:
            sequences: Input sequences, [batch_size, len, dim].
            sequence_lengths: Lengths for each sequence, [batch_size].
            rnn: Registered LSTM or GRU.

        Returns:
            All hidden states, [batch_size, len, hid].
        """
        # Sort input sequences
        sorted_inputs, sorted_sequence_lengths, restoration_indices = _sort_batch_by_length(
            sequences, sequence_lengths)
        # Pack input sequences
        packed_sequence_input = pack_padded_sequence(
            sorted_inputs,
            sorted_sequence_lengths.data.long().tolist(),
            batch_first=True)
        # Run RNN
        packed_sequence_output, _ = rnn(packed_sequence_input, None)
        # Unpack hidden states
        unpacked_sequence_tensor, _ = pad_packed_sequence(
            packed_sequence_output, batch_first=True)
        # Restore the original order in the batch and return all hidden states
        return unpacked_sequence_tensor.index_select(0, restoration_indices)

    def forward(self, batch):
        # Obtain masks and lengths for passage and question.
        passage_mask = (batch['passages'] !=
                        self.pad_token_id)  # [batch_size, p_len]
        question_mask = (batch['questions'] !=
                         self.pad_token_id)  # [batch_size, q_len]
        passage_lengths = passage_mask.long().sum(-1)  # [batch_size]
        question_lengths = question_mask.long().sum(-1)  # [batch_size]

        # 1) Embedding Layer: Embed the passage and question.
        passage_embeddings = self.embedding(
            batch['passages'])  # [batch_size, p_len, p_dim]
        question_embeddings = self.embedding(
            batch['questions'])  # [batch_size, q_len, q_dim]
        passage_char_embeddings = self.char_embedding(
            batch['char_passages']
        )  # [batch_size, p_len, word_length, word_dim]
        question_char_embeddings = self.char_embedding(
            batch['char_questions']
        )  # [batch_size, q_len, word_length, word_dim]

        if self.args.char_embedding_type == 'average':
            # Average char embeddings baseline
            passage_char_embeddings_avg = passage_char_embeddings.mean(dim=2)
            question_char_embeddings_avg = question_char_embeddings.mean(dim=2)
            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_avg], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_avg], dim=2)
        else:
            # Conv 1D char embeddings: treat each word's characters as a
            # sequence and convolve over character positions.
            passage_char_embeddings_conv1d_input = passage_char_embeddings.reshape(
                (-1, passage_char_embeddings.shape[3],
                 passage_char_embeddings.shape[2]))
            question_char_embeddings_conv1d_input = question_char_embeddings.reshape(
                (-1, question_char_embeddings.shape[3],
                 question_char_embeddings.shape[2]))
            passage_char_embeddings_tmp1 = self.char_relu(
                self.char_conv(passage_char_embeddings_conv1d_input))
            # Collapse the last dimension of the conv1d output with global max pooling
            passage_char_embeddings_final = torch.nn.functional.max_pool1d(
                passage_char_embeddings_tmp1,
                passage_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    passage_char_embeddings.shape[0],
                    passage_char_embeddings.shape[1], -1)
            question_char_embeddings_tmp1 = self.char_relu(
                self.char_conv(question_char_embeddings_conv1d_input))
            # Collapse the last dimension of the conv1d output with global max pooling
            question_char_embeddings_final = torch.nn.functional.max_pool1d(
                question_char_embeddings_tmp1,
                question_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    question_char_embeddings.shape[0],
                    question_char_embeddings.shape[1], -1)
            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_final], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_final], dim=2)

        # 2) Context2Query: Compute weighted sum of question embeddings for
        # each passage word and concatenate with passage embeddings.
        aligned_scores = self.aligned_att(
            passage_final_embeddings, question_final_embeddings,
            ~question_mask)  # [batch_size, p_len, q_len]
        aligned_embeddings = aligned_scores.bmm(
            question_embeddings)  # [batch_size, p_len, q_dim]
        passage_embeddings = cuda(
            self.args,
            torch.cat((passage_embeddings, aligned_embeddings), 2),
        )  # [batch_size, p_len, p_dim + q_dim]

        # 3) Passage Encoder
        passage_hidden = self.sorted_rnn(
            passage_embeddings, passage_lengths,
            self.passage_rnn)  # [batch_size, p_len, p_hid]
        passage_hidden = self.dropout(
            passage_hidden)  # [batch_size, p_len, p_hid]

        # 4) Question Encoder: Encode question embeddings.
        question_hidden = self.sorted_rnn(
            question_embeddings, question_lengths,
            self.question_rnn)  # [batch_size, q_len, q_hid]

        # 5) Question Attentive Sum: Compute weighted sum of question hidden
        # vectors.
        question_scores = self.question_att(question_hidden, ~question_mask)
        question_vector = question_scores.unsqueeze(1).bmm(
            question_hidden).squeeze(1)
        question_vector = self.dropout(question_vector)  # [batch_size, q_hid]

        # 6) Start Position Pointer: Compute logits for start positions
        start_logits = self.start_output(
            passage_hidden, question_vector,
            ~passage_mask)  # [batch_size, p_len]

        # 7) End Position Pointer: Compute logits for end positions
        end_logits = self.end_output(
            passage_hidden, question_vector,
            ~passage_mask)  # [batch_size, p_len]

        return start_logits, end_logits  # [batch_size, p_len], [batch_size, p_len]
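The loss computation is not part of this module; a common pattern (an assumption here, not necessarily this repository's training loop) is to apply cross-entropy to the start and end logits separately and sum the two terms:

import torch
import torch.nn.functional as F

# Illustrative only: how start/end logits shaped like the ones returned above
# are typically scored against gold spans. Shapes follow the forward() docstring.
batch_size, p_len = 4, 50
start_logits = torch.randn(batch_size, p_len)
end_logits = torch.randn(batch_size, p_len)
start_positions = torch.randint(0, p_len, (batch_size,))
end_positions = torch.randint(0, p_len, (batch_size,))

loss = (F.cross_entropy(start_logits, start_positions) +
        F.cross_entropy(end_logits, end_positions))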
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True
tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []

for d in train_data:
             columns=['mId', 'tmdbId', 'title'],
             index=False)

'''
create genres
mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
Genre2Id: 20 lines, each line includes (gId, genre name)
gId ranges from 45843 to 45862
'''
f = open("processed_data/mId2Genre.txt", "w")
genreIdx = Indexer()
for idx, row in movies.iterrows():
    mId, raw_genres = row['mId'], row['genres']
    raw_genres = raw_genres.replace("\'", "\"")
    genres_l = json.loads(raw_genres)
    f.write("%d %d" % (mId, len(genres_l)))
    for g in genres_l:
        f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
    f.write("\n")
f.close()

f = open("processed_data/Genre2Id.txt", "w")
num_genres = len(genreIdx)
for i in range(num_genres):
    f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
f.close()
id_base += num_genres

'''
create credits
mId2CC.txt: 45476 lines, each line includes (mId, num of crew/casts, cIds)
'''
credits = readCreditData(args, tmid2mid)
print("credits.shape %s" % (str(credits.shape)))
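A hedged sketch of reading mId2Genre.txt back in, following the line format documented above (mId, the number of genres, then that many genre ids); the function name is made up:

def load_mid2genre(path="processed_data/mId2Genre.txt"):
    # Each line is "<mId> <num_genres> <gId_1> ... <gId_n>", as written above.
    mid2genres = {}
    with open(path) as fin:
        for line in fin:
            parts = line.split()
            m_id, num = int(parts[0]), int(parts[1])
            mid2genres[m_id] = [int(g) for g in parts[2:2 + num]]
    return mid2genres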