def loaddatawithtokenize(i=0, nb_words=20000, start_char=1, oov_char=2,
                         index_from=3, withraw=False, datalen=500):
    """Load dataset ``i`` and turn every split into padded id sequences.

    A single ``Tokenizer(lower=True)`` is fitted on the train and test
    texts (not on the adversarial texts) and then applied to all three
    splits.  Each word id is shifted by ``index_from``, optionally
    preceded by ``start_char``, ids that are not below ``nb_words`` are
    replaced by ``oov_char``, and the result goes through
    ``pad_sequences(..., maxlen=datalen)``.

    Args:
        i: dataset selector forwarded to ``loaddata``.
        nb_words: ids greater than or equal to this become ``oov_char``.
        start_char: id prepended to every sequence; ``None`` disables it.
        oov_char: replacement id for out-of-range ids.
        index_from: offset added to every tokenizer id.
        withraw: when true, also return the untouched text lists.
        datalen: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        ``(traindata, adversarialdata, testdata, tokenizer, numclass)``,
        followed by ``rawtrain, rawadversarial, rawtest`` when ``withraw``
        is true.  The three data objects are the ones returned by
        ``loaddata`` with their ``content`` attribute replaced.
    """
    (traindata, adversarialdata, testdata, numclass) = loaddata(i)

    # Shallow copies of the raw texts, taken before ``content`` is replaced.
    rawtrain = traindata.content[:]
    rawadversarial = adversarialdata.content[:]
    rawtest = testdata.content[:]

    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(traindata.content + testdata.content)

    # ``is None`` instead of ``== None``: identity is the correct test.
    prefix = [] if start_char is None else [start_char]

    def _encode(texts):
        """Tokenize, shift, clip to the vocabulary and pad one split."""
        encoded = []
        for seq in tokenizer.texts_to_sequences(texts):
            shifted = prefix + [w + index_from for w in seq]
            # The start marker goes through the same OOV filter as the words.
            encoded.append([w if w < nb_words else oov_char for w in shifted])
        return pad_sequences(encoded, maxlen=datalen)

    adversarialdata.content = _encode(adversarialdata.content)
    traindata.content = _encode(traindata.content)
    testdata.content = _encode(testdata.content)

    if withraw:
        return (traindata, adversarialdata, testdata, tokenizer, numclass,
                rawtrain, rawadversarial, rawtest)
    return traindata, adversarialdata, testdata, tokenizer, numclass
def predict_splitter(X, batchsize):
    """Group the samples of ``X`` by length bucket and one-hot encode them.

    Every sample is assigned to the first entry of the module-level
    ``_buckets`` whose value is at least the sample length; samples longer
    than every bucket are left out.  For each non-empty bucket the samples
    and their ``batchfindORF`` output are padded to the bucket length and
    one-hot encoded with channel 0 removed.

    Args:
        X: indexable with an integer index array (``X[np.array([...])]``).
        batchsize: accepted for interface compatibility; not used.

    Returns:
        ``(X_bucket, tot_index)`` where ``X_bucket`` is a list of
        ``[X_prime, orf_prime]`` pairs (one per non-empty bucket) and
        ``tot_index`` holds the original sample positions in the same
        order.  ``tot_index`` starts from ``np.zeros(0)`` and is therefore
        a float array.
    """
    def _one_hot_without_zero(padded):
        # Compare against 0..max to get one channel per id, then drop
        # channel 0.
        encoded = (np.arange(padded.max() + 1) == padded[:, :, None]).astype(dtype='float32')
        return np.delete(encoded, 0, axis=-1)

    members_per_bucket = [[] for _ in _buckets]
    for sample_pos, sample in enumerate(X):
        for bucket_pos, bucket_len in enumerate(_buckets):
            if len(sample) <= bucket_len:
                members_per_bucket[bucket_pos].append(sample_pos)
                break

    X_bucket = []
    tot_index = np.zeros(0)
    for bucket_pos, members in enumerate(members_per_bucket):
        if len(members) == 0:
            continue
        x_index = np.array(members)
        tot_index = np.concatenate([tot_index, x_index], axis=0)

        X_prime = X[x_index]
        # The ORF features are computed on the unpadded sequences.
        orf_prime = batchfindORF(X_prime)
        orf_prime = preprocessing.pad_sequences(orf_prime, maxlen=_buckets[bucket_pos])
        orf_prime = _one_hot_without_zero(orf_prime)

        X_prime = preprocessing.pad_sequences(X_prime, maxlen=_buckets[bucket_pos])
        X_prime = _one_hot_without_zero(X_prime)

        X_bucket.append([X_prime, orf_prime])
    return X_bucket, tot_index
def bucket_generator_ORF(X, Y, batchsize):
    """Endlessly yield ``([X_batch, orf_batch], Y_batch)`` grouped by bucket.

    Samples are assigned to the first entry of the module-level
    ``_buckets`` that is at least their length (longer samples are
    ignored).  At the start of every pass ``shuffle`` is called on each
    bucket's index list and the global batch order is permuted with
    ``np.random.shuffle``; then one batch is yielded per step.

    Args:
        X: sequences, indexable with an integer index array.
        Y: targets, indexable with the same index array.
        batchsize: maximum number of samples per yielded batch.

    Yields:
        ``[X_batch, orf_batch], Y_batch`` where both inputs are padded to
        the bucket length and one-hot encoded with channel 0 removed.
    """
    def _one_hot_without_zero(padded):
        encoded = (np.arange(padded.max() + 1) == padded[:, :, None]).astype(
            dtype='float32')
        return np.delete(encoded, 0, axis=-1)

    members_per_bucket = [[] for _ in _buckets]
    for sample_pos, sample in enumerate(X):
        for bucket_pos, bucket_len in enumerate(_buckets):
            if len(sample) <= bucket_len:
                members_per_bucket[bucket_pos].append(sample_pos)
                break

    # Number of batches each bucket contributes to one pass.
    batches_per_bucket = [
        int(len(members) / batchsize) + ceil((len(members) % batchsize) / batchsize)
        for members in members_per_bucket
    ]
    tot_batch = sum(batches_per_bucket)

    step = 0
    while True:
        if step % tot_batch == 0:
            # New pass: reshuffle inside each bucket, then the batch order.
            step = 0
            batch_order = np.arange(tot_batch)
            for members in members_per_bucket:
                shuffle(members)
            np.random.shuffle(batch_order)

        # Translate the global batch number into (bucket, local batch).
        local_batch = batch_order[step]
        for bucket_pos, n_batches in enumerate(batches_per_bucket):
            if not local_batch < n_batches:
                local_batch -= n_batches
                continue

            first = local_batch * batchsize
            last = (local_batch + 1) * batchsize
            batch_index = np.array(members_per_bucket[bucket_pos][first:last])

            X_batch = X[batch_index]
            # ORF features come from the unpadded sequences.
            orf_batch = batchfindORF(X_batch)
            orf_batch = preprocessing.pad_sequences(orf_batch, maxlen=_buckets[bucket_pos])
            orf_batch = _one_hot_without_zero(orf_batch)

            X_batch = preprocessing.pad_sequences(X_batch, maxlen=_buckets[bucket_pos])
            X_batch = _one_hot_without_zero(X_batch)

            Y_batch = Y[batch_index]
            yield [X_batch, orf_batch], Y_batch
            break
        step += 1
def findORF(seq):
    """Return ``(start, end)`` of the longest stop-free stretch of ``seq``.

    The sequence is one-hot encoded (channel 0 removed, padded to four
    channels when only three are present) and fed to the module-level
    ``stopmodel`` once per reading frame (offsets 0, 1, 2).  Positions
    where the model's arg-max minus one equals 1 are treated as
    delimiters; the longest gap between consecutive delimiters wins, and
    ties go to the smaller start.

    Args:
        seq: a single integer-encoded sequence.

    Returns:
        ``(o_s, o_e)`` in the coordinates of ``seq``; ``(0, 0)`` when no
        frame has a gap longer than zero.
    """
    length = len(seq)

    encoded = preprocessing.pad_sequences([seq], maxlen=length, padding='post')
    encoded = (np.arange(encoded.max() + 1) == encoded[:, :, None]).astype(dtype='float32')
    encoded = np.delete(encoded, 0, axis=-1)
    if encoded.shape[2] == 3:
        # Append an all-zero fourth channel.
        filler = np.zeros((encoded.shape[0], encoded.shape[1], 1))
        encoded = np.concatenate((encoded, filler), axis=2)

    best_len, best_start, best_end = 0, 0, 0
    for frame in range(3):
        n_triplets = (length - frame) // 3
        calls = stopmodel.predict(encoded[:, frame:])[:, :n_triplets]
        calls = np.argmax(calls, axis=-1) - 1

        # Delimiter positions, with sentinels before the first and after
        # the last triplet.
        cuts = np.append(-1, np.where(calls == 1)[1])
        cuts = np.append(cuts, calls.shape[1])
        gaps = np.diff(cuts) - 1

        widest = np.argmax(gaps)
        gap_len = gaps[widest]
        start = frame + 3 * cuts[widest] + 3
        end = frame + 3 * cuts[widest + 1]

        longer = gap_len > best_len
        same_but_earlier = (best_len == gap_len) and start < best_start
        if longer or same_but_earlier:
            best_len, best_start, best_end = gap_len, start, end
    return best_start, best_end
def __init__(self, opts):
    """Read the corpus, vectorise it and build the embedding matrix.

    Text comes from ``train_x.txt`` / ``test_x.txt`` and tags from
    ``train_y.txt`` / ``test_y.txt`` inside the data directory selected by
    ``opts.task`` and ``opts.jackknife``.  Train and test lines are
    concatenated, tokenised and padded together, then split back; the
    training rows are shuffled with ``np.random.shuffle``.

    Attributes of ``opts`` that are read: ``task``, ``jackknife``,
    ``attention``, ``joint``, ``embedding_name``, ``embedding_dim`` and
    ``seed`` (``opts`` is also forwarded to ``pad_sequences``).

    NOTE(review): the tag files (and the jackknife files) are assumed to
    have exactly one line per text line -- confirm with the data layout.
    """

    def _read_lines(path):
        # Context manager so the handle is released even when reading fails.
        with open(path) as handle:
            return handle.readlines()

    if opts.task == 'POS_models':
        data_dir = 'data/pos_data'
        self.kfold = False
    elif opts.jackknife:
        data_dir = 'data/super_data'
        jk_data_dir = 'data/pos_data'
        path_to_k_fold = os.path.join(jk_data_dir, 'train_y.txt')
        path_to_k_fold_test = os.path.join(jk_data_dir, 'test_y.txt')
        self.kfold = True
    else:
        data_dir = 'data/super_data'
        self.kfold = False
    path_to_text = os.path.join(data_dir, 'train_x.txt')
    path_to_text_test = os.path.join(data_dir, 'test_x.txt')
    path_to_POS = os.path.join(data_dir, 'train_y.txt')
    path_to_POS_test = os.path.join(data_dir, 'test_y.txt')

    self.MAX_NB_WORDS = 200000000000

    # Train lines first, test lines appended: the first nb_train_samples
    # rows of every array below belong to the training set.
    texts = _read_lines(path_to_text)
    nb_train_samples = len(texts)
    self.nb_train_samples = nb_train_samples
    texts = texts + _read_lines(path_to_text_test)
    print('length', len(texts))

    # Vectorise the text samples into integer sequences.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    indicator = tokenizer.cap_indicator(texts)
    num_indicator = tokenizer.num_indicator(texts)
    suffix = tokenizer.suffix_extract(texts)
    suffix_tokenizer = Tokenizer()
    suffix_tokenizer.fit_on_texts(suffix, non_split=True)
    suffix_sequences = suffix_tokenizer.texts_to_sequences(
        suffix, non_split=True)

    word_index = tokenizer.word_index
    self.word_index = word_index
    suffix_index = suffix_tokenizer.word_index
    print('Found %s unique words.' % len(word_index))

    data = pad_sequences(sequences, opts, True)
    suffix_data = pad_sequences(suffix_sequences, opts)
    cap_indicator = pad_sequences(indicator, opts)
    num_indicator = pad_sequences(num_indicator, opts)

    # Gold tags.
    texts = _read_lines(path_to_POS) + _read_lines(path_to_POS_test)
    lab_tokenizer = Tokenizer()
    lab_tokenizer.fit_on_texts(texts)
    lab_sequences = lab_tokenizer.texts_to_sequences(texts)
    tag_index = lab_tokenizer.word_index
    self.tag_index = tag_index
    self.tag_size = len(tag_index)
    print('Found %s unique tags.' % len(tag_index))
    labels = pad_sequences(lab_sequences, opts)

    if opts.jackknife:
        texts = _read_lines(path_to_k_fold) + _read_lines(path_to_k_fold_test)
        jk_tokenizer = Tokenizer()
        jk_tokenizer.fit_on_texts(texts)
        jk_sequences = jk_tokenizer.texts_to_sequences(texts)
        jk_index = jk_tokenizer.word_index
        self.jk_index = jk_index
        self.jk_size = len(jk_index)
        print('Found %s unique jackknife tags.' % len(jk_index))
        jk_labels = pad_sequences(jk_sequences, opts)

    indices = np.arange(nb_train_samples)
    np.random.shuffle(indices)
    nb_validation_samples = data.shape[0] - nb_train_samples
    self.nb_validation_samples = nb_validation_samples

    seq_length = labels.shape[1]
    if opts.attention in [100, 101, 102, 103]:
        # Grow both sets to the next multiple of ten; the extra rows stay
        # zero.
        self.nb_train_added = nb_train_samples // 10 * 10 + 10
        self.nb_validation_added = nb_validation_samples // 10 * 10 + 10
    else:
        self.nb_train_added = nb_train_samples
        self.nb_validation_added = nb_validation_samples

    def _split(values):
        """Return zero-padded (shuffled train, test) copies of ``values``.

        Slices are taken from the front (``[:nb_train_samples]`` and
        ``[nb_train_samples:]``) rather than with a negative offset, so an
        empty test set no longer yields an empty training slice.
        """
        train = np.zeros([self.nb_train_added, seq_length])
        train[:nb_train_samples] = values[:nb_train_samples][indices]
        test = np.zeros([self.nb_validation_added, seq_length])
        test[:nb_validation_samples] = values[nb_train_samples:]
        return train, test

    self.X_train, self.X_test = _split(data)
    if opts.jackknife:
        self.jk_labels, self.jk_labels_test = _split(jk_labels)
    self.train_cap_indicator, self.test_cap_indicator = _split(cap_indicator)
    self.train_num_indicator, self.test_num_indicator = _split(num_indicator)
    self.suffix_train, self.suffix_test = _split(suffix_data)
    self.y_train, self.y_test = _split(labels)
    if opts.joint:
        self.pos_train = self.jk_labels
        self.pos_test = self.jk_labels_test

    if opts.jackknife:
        K = 10
        # Groups of floor(n / K) + 1 rows; the remainder is kept and the
        # last fold may be shorter.
        samples_per_group = (nb_train_samples // K) + 1
        print('splitting into {} folds'.format(K))
        folds = [
            slice(samples_per_group * k, samples_per_group * (k + 1))
            for k in range(K)
        ]
        self.X_train_k_fold = [self.X_train[fold] for fold in folds]
        self.train_cap_indicator_k_fold = [
            self.train_cap_indicator[fold] for fold in folds
        ]
        self.train_num_indicator_k_fold = [
            self.train_num_indicator[fold] for fold in folds
        ]
        self.suffix_train_k_fold = [self.suffix_train[fold] for fold in folds]
        self.y_train_k_fold = [self.jk_labels[fold] for fold in folds]
        print('end splitting')

    self.nb_suffix = len(suffix_index)
    self.suffix_embedding_mat = np.random.randn(self.nb_suffix + 1, 10)
    self.nb_words = min(self.MAX_NB_WORDS, len(word_index))

    ## cond entropy: rows of the words listed in certain.pkl are zeroed.
    self.cond_matrix = np.ones((self.nb_words + 1, 1))
    # NOTE(review): pickle.load can execute arbitrary code -- certain.pkl
    # must come from a trusted source.  Binary mode is what pickle expects.
    with open('certain.pkl', 'rb') as fhand:
        certain = pickle.load(fhand)
    self.certain_words = [self.word_index[word] for word in certain]
    for certain_word in self.certain_words:
        self.cond_matrix[certain_word] = 0.0
    ### cond entropy ends

    if opts.embedding_name == 'random':
        np.random.seed(opts.seed)
        self.embedding_matrix = np.random.uniform(
            -2, 2, size=(self.nb_words + 1, opts.embedding_dim))
    elif opts.embedding_name == 'word2vec':
        if not opts.embedding_dim == 300:  # word2vec is of 300 dim
            sys.exit('error in dim')
        filename = os.path.join('../word2vec',
                                'GoogleNews-vectors-negative300.bin')
        import gensim
        self.embedding_matrix = np.zeros(
            (self.nb_words + 1, opts.embedding_dim))
        self.word2vec_model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            filename, binary=True)
        print('Found %s word vectors.' % len(self.word2vec_model.vocab))
        for word, i in word_index.items():
            # Fixed: the test used to be ``i > MAX_NB_WORDS``, the opposite
            # of the GloVe branch below, so no row was ever filled.
            if i <= self.MAX_NB_WORDS and word in self.word2vec_model.vocab:
                self.embedding_matrix[i] = self.word2vec_model[word]
    else:
        # Any other value is treated as the path of a text embedding file:
        # one word per line followed by its coefficients.
        self.embeddings_index = {}
        print('Indexing word vectors.')
        with open(opts.embedding_name) as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        print('Found %s word vectors.' % len(self.embeddings_index))
        self.embedding_matrix = np.zeros(
            (self.nb_words + 1, opts.embedding_dim))
        for word, i in word_index.items():
            if i > self.MAX_NB_WORDS:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                if not self.embedding_matrix.shape[1] == len(
                        embedding_vector):
                    sys.exit('error in dim')
                self.embedding_matrix[i] = embedding_vector

    # Iteration state.
    self._index_in_epoch = 0
    self._num_examples = self.X_train.shape[0]
    self._num_test_examples = self.X_test.shape[0]
    self._epoch_completed = 0
    self._index_in_test = 0
    if opts.jackknife:
        self.k = 0
    self.opts = opts
train_label = train_data_df.iloc[:, -1].values dev_data = dev_data_df.iloc[:, -2].values dev_label = dev_data_df.iloc[:, -1].values test_data = test_data_df.iloc[:, -1].values # 获取词典与词向量 pretrained_embedding_file_path = base_path + "/glove/glove.6B.50d.txt" word2idx, embedding_matrix = load_pretrained_embedding( pretrained_embedding_file_path=pretrained_embedding_file_path) # 文本向量化 train_data = texts_convert_to_ids(train_data, word2idx) dev_data = texts_convert_to_ids(dev_data, word2idx) test_data = texts_convert_to_ids(test_data, word2idx) train_data = torch.from_numpy(pad_sequences(train_data)) dev_data = torch.from_numpy(pad_sequences(dev_data)) test_data = torch.from_numpy(pad_sequences(test_data)) # 产生batch data class my_dataset(Dataset): def __init__(self, data, label): self.data = data self.label = label def __len__(self): return len(self.data) def __getitem__(self, item): item_data = self.data[item]
print('Epoch {}'.format(epoch)) training_loss = 0.0 predicted_values_train = [] true_values_train = [] predicted_values_test = [] true_values_test = [] for batch_idx, ex in enumerate(train_data): tokenized_tweets = tokenize_tweets2(ex['tokenized_tweet'], text_preprocessor) indexed_labels = [labels_dict[i] for i in ex['label']] tensor_dictionary = pad_sequences(indexed_labels, tokenized_tweets, dictionary) data = tensor_dictionary['tweet_tensor'].cuda() label = tensor_dictionary['label_tensor'].cuda() model.cuda().train() predictions = model(data, tensor_dictionary['length']) predictions = F.log_softmax(predictions, dim=1) #indexed_labels = torch.LongTensor([labels_dict[i] for i in ex['label']]).cuda() loss = F.nll_loss(predictions, label) training_loss += loss.data # optimizer = optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=0.001, momentum=0.9, nesterov = True) optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
def __init__(self, opts, test_opts=None):
    """Index words, suffixes, flags, jackknife tags, characters and tags.

    Vocabularies are fitted on the training files only; the test files
    (taken from ``test_opts`` when given, otherwise from ``opts``) are
    then encoded with the same tokenizers.  Words that occur in the
    embedding file but not in the training text get new indices after the
    training vocabulary.  Results land in ``self.inputs_train`` and
    ``self.inputs_test`` (dicts keyed by feature name), padded with
    ``pad_sequences(x, key)``; the training rows are permuted with
    ``random.seed(0)`` / ``random.shuffle``.

    Attributes of ``opts`` that are read: ``text_train``, ``tag_train``,
    ``jk_train``, ``embedding_dim``, ``word_embeddings_file``,
    ``suffix_dim``, ``cap``, ``num``, ``jk_dim``, ``chars_dim`` and, when
    ``test_opts`` is ``None``, ``text_test``, ``tag_test``, ``jk_test``.
    """

    def _read_utf8(path):
        # Context manager so the handle is released even when reading fails.
        with io.open(path, encoding='utf-8') as handle:
            return handle.readlines()

    def _read_default(path):
        # The tag files are opened with the platform default encoding.
        with open(path) as handle:
            return handle.readlines()

    def _freq_table(size, nb_items, sorted_freqs):
        """Return a float32 table of ``sorted_freqs`` scaled to sum to one.

        Index 0 (zero padding) stays at zero.
        NOTE(review): the slice ``1:nb_items`` also leaves index
        ``nb_items`` at zero -- confirm this matches the length of
        ``sorted_freqs`` produced by the tokenizer.
        """
        table = np.zeros([size]).astype(np.float32)
        table[1:nb_items] = sorted_freqs
        return table / np.sum(table)

    path_to_text = opts.text_train
    path_to_tag = opts.tag_train
    path_to_jk = opts.jk_train
    test_source = opts if test_opts is None else test_opts
    path_to_text_test = test_source.text_test
    path_to_tag_test = test_source.tag_test
    path_to_jk_test = test_source.jk_test

    self.inputs_train = {}
    self.inputs_test = {}

    ## indexing sents files
    texts = _read_utf8(path_to_text)
    self.nb_train_samples = len(texts)
    word_tokenizer = Tokenizer(lower=True)
    word_tokenizer.fit_on_texts(texts)
    # Same dict object as the tokenizer's: the unseen words added below
    # are therefore visible to texts_to_sequences.
    self.word_index = word_tokenizer.word_index
    word_sorted_freqs = word_tokenizer.sorted_freqs
    self.nb_words = len(self.word_index)
    print('Found {} unique lowercased words including -unseen-.'.format(self.nb_words))

    # Look up the word embeddings; lines whose field count is not
    # embedding_dim + 1 are ignored.
    glove_size = opts.embedding_dim
    self.embeddings_index = {}
    print('Indexing word vectors.')
    with io.open(opts.word_embeddings_file, encoding='utf-8') as f:
        for line in f:
            values = line.strip().split(' ')
            if len(values) == opts.embedding_dim+1:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
    print('Found {} word vectors.'.format(len(self.embeddings_index)))

    ## words that appear in the embedding file but not in the training set
    unseens = list(set(self.embeddings_index.keys()) - set(self.word_index.keys()))
    nb_unseens = len(unseens)
    print('Found {} words not in the training set'.format(nb_unseens))
    ## +1 for padding (idx 0)
    self.word_embeddings = np.zeros((self.nb_words+1+nb_unseens, glove_size))

    ## Frequencies for Adversarial Training (Yasunaga et al. 2017)
    self.word_freqs = _freq_table(
        self.nb_words+1+nb_unseens, self.nb_words, word_sorted_freqs)

    ## first index the words in the training set (missing ones stay zero)
    for word, i in self.word_index.items():
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:
            self.word_embeddings[i] = embedding_vector
    ## then append the unseen words to the word_index dictionary
    for unseen in unseens:
        self.word_index[unseen] = len(self.word_index) + 1
        self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
    self.idx_to_word = invert_dict(self.word_index)
    print('end glove indexing')

    texts = texts + _read_utf8(path_to_text_test)
    self.nb_validation_samples = len(texts) - self.nb_train_samples
    text_sequences = word_tokenizer.texts_to_sequences(texts)
    self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
    self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
    ## indexing sents files ends

    ## indexing suffixes
    if opts.suffix_dim > 0:
        suffix = word_tokenizer.suffix_extract(texts)
        suffix_tokenizer = Tokenizer()
        suffix_tokenizer.fit_on_texts(suffix[:self.nb_train_samples], non_split=True)
        self.suffix_index = suffix_tokenizer.word_index
        self.nb_suffixes = len(self.suffix_index)
        self.suffix_freqs = _freq_table(
            self.nb_suffixes+1, self.nb_suffixes, suffix_tokenizer.sorted_freqs)
        self.idx_to_suffix = invert_dict(self.suffix_index)
        print('Found {} unique suffixes including -unseen-.'.format(self.nb_suffixes))
        suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix, non_split=True)
        self.inputs_train['suffix'] = suffix_sequences[:self.nb_train_samples]
        self.inputs_test['suffix'] = suffix_sequences[self.nb_train_samples:]
    ## indexing suffixes ends

    ## indexing capitalization
    if opts.cap:
        cap_sequences = word_tokenizer.cap_indicator(texts)
        self.inputs_train['cap'] = cap_sequences[:self.nb_train_samples]
        self.inputs_test['cap'] = cap_sequences[self.nb_train_samples:]
    ## indexing capitalization ends

    ## indexing numbers
    if opts.num:
        num_sequences = word_tokenizer.num_indicator(texts)
        self.inputs_train['num'] = num_sequences[:self.nb_train_samples]
        self.inputs_test['num'] = num_sequences[self.nb_train_samples:]
    ## indexing numbers ends

    ## indexing jackknife files
    if opts.jk_dim > 0:
        jk_texts = _read_utf8(path_to_jk)
        jk_tokenizer = Tokenizer(lower=False)
        jk_tokenizer.fit_on_texts(jk_texts)
        self.jk_index = jk_tokenizer.word_index
        self.nb_jk = len(self.jk_index)
        self.jk_freqs = _freq_table(
            self.nb_jk+1, self.nb_jk, jk_tokenizer.sorted_freqs)
        self.idx_to_jk = invert_dict(self.jk_index)
        print('Found {} unique tags including -unseen-.'.format(self.nb_jk))
        jk_texts = jk_texts + _read_utf8(path_to_jk_test)  ## do not lowercase tCO
        jk_sequences = jk_tokenizer.texts_to_sequences(jk_texts)
        self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
        self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
    ## indexing jackknife files ends

    ## indexing char files
    if opts.chars_dim > 0:
        char_texts = _read_utf8(path_to_text)
        char_tokenizer = Tokenizer(lower=False,char_encoding=True)
        char_tokenizer.fit_on_texts(char_texts)
        self.char_index = char_tokenizer.word_index
        self.nb_chars = len(self.char_index)
        self.char_freqs = _freq_table(
            self.nb_chars+1, self.nb_chars, char_tokenizer.sorted_freqs)
        self.idx_to_char = invert_dict(self.char_index)
        print('Found {} unique characters including -unseen-.'.format(self.nb_chars))
        char_texts = char_texts + _read_utf8(path_to_text_test)  ## do not lowercase tCO
        char_sequences = char_tokenizer.texts_to_sequences(char_texts)
        self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
        self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
    ## indexing char files ends

    ## indexing stag files
    tag_texts = _read_default(path_to_tag)
    tag_tokenizer = Tokenizer(lower=False)  ## for tCO
    tag_tokenizer.fit_on_texts(tag_texts, zero_padding=False)
    self.tag_index = tag_tokenizer.word_index
    self.nb_tags = len(self.tag_index)
    self.idx_to_tag = invert_dict(self.tag_index)
    print('Found {} unique tags including -unseen-.'.format(self.nb_tags))
    tag_texts = tag_texts + _read_default(path_to_tag_test)  ## do not lowercase tCO
    tag_sequences = tag_tokenizer.texts_to_sequences(tag_texts)
    self.inputs_train['tags'] = tag_sequences[:self.nb_train_samples]
    self.inputs_test['tags'] = tag_sequences[self.nb_train_samples:]
    ## indexing stag files ends

    ## flat gold tags of the test set, for calculation of accuracy
    self.test_gold = np.hstack(tag_sequences[self.nb_train_samples:])

    ## padding the train inputs and test inputs
    self.inputs_train = {key: pad_sequences(x, key) for key, x in self.inputs_train.items()}
    # Fixed seed: every run uses the same training permutation.
    random.seed(0)
    perm = np.arange(self.nb_train_samples)
    random.shuffle(perm)
    self.inputs_train = {key: x[perm] for key, x in self.inputs_train.items()}
    self.inputs_test = {key: pad_sequences(x, key) for key, x in self.inputs_test.items()}

    ## setting the current indices
    self._index_in_epoch = 0
    self._epoch_completed = 0
    self._index_in_test = 0
trained_model = evaluate_model.compile_train(nmt_model, train_english_input, train_german_output, test_english_input, test_german_output) #evaluate model evaluate_model.model_speech_evaluation(trained_model, german_tokenizer, train_english_input, train, role='Train') evaluate_model.model_speech_evaluation(trained_model, german_tokenizer, test_english_input, test, role='Test') ''' #------------------------------------------------------------------------------------------------------------------------------------> GERMAN TO ENGLISH #prepare train data train_english_output = preprocessing.encode_sequences( english_tokenizer, train[:, 0]) #print(train_english_input) train_english_output = preprocessing.pad_sequences( english_max_sentence_length, train_english_output) #print(train_english_input) train_german_input = preprocessing.encode_sequences( german_tokenizer, train[:, 1]) train_german_input = preprocessing.pad_sequences( german_max_sentence_length, train_german_input) #make the target as an one hot encoding #train_english_output = preprocessing.oneHotEncoding(train_english_output, english_vocabulary_size) #and one for english #train_english_output = oneHotEncoding(train_english_input, english_vocabulary_size) #print(train_german_output) #print(train_german_output[0].shape)
def __init__(self, opts, test_opts=None):
    """Index words, characters, POS tags, supertags, relations and arcs.

    Vocabularies are fitted on the training files; the test files (taken
    from ``test_opts`` when given, otherwise from ``opts``) are encoded
    with the same tokenizers.  Words found in the embedding file but not
    in the training text get new indices after the training vocabulary.
    Results land in ``self.inputs_train`` / ``self.inputs_test`` (dicts
    keyed by feature name), padded with ``pad_sequences(x, key)``; a zero
    column is prepended to ``'arcs'`` and the training rows are permuted
    with ``random.seed(0)`` / ``random.shuffle``.  Flat gold arrays for
    the test set are stored in ``self.gold_*``.

    Attributes of ``opts`` that are read: ``text_train``, ``tag_train``,
    ``jk_train``, ``arc_train``, ``rel_train``, ``embedding_dim``,
    ``word_embeddings_file``, ``chars_dim``, ``jk_dim``, ``stag_dim``,
    ``model`` and, when ``test_opts`` is ``None``, the ``*_test`` paths
    including ``punc_test``.
    """

    def _read_default(path):
        # Context manager so the handle is released even when reading fails.
        with open(path) as handle:
            return handle.readlines()

    def _read_utf8(path):
        with io.open(path, encoding='utf-8') as handle:
            return handle.readlines()

    def _flat_without_first(sequences):
        """Concatenate the sequences after dropping each first element.

        A list (not ``map``) is handed to ``np.hstack`` so that this also
        works where ``map`` returns an iterator.
        """
        return np.hstack([seq[1:] for seq in sequences])

    path_to_text = opts.text_train
    path_to_tag = opts.tag_train
    path_to_jk = opts.jk_train
    path_to_arc = opts.arc_train
    path_to_rel = opts.rel_train
    test_source = opts if test_opts is None else test_opts
    path_to_text_test = test_source.text_test
    path_to_tag_test = test_source.tag_test
    path_to_jk_test = test_source.jk_test
    path_to_arc_test = test_source.arc_test
    path_to_rel_test = test_source.rel_test
    path_to_punc_test = test_source.punc_test

    self.inputs_train = {}
    self.inputs_test = {}

    ## indexing sents files
    texts = _read_default(path_to_text)
    self.nb_train_samples = len(texts)
    word_tokenizer = Tokenizer(lower=True)
    word_tokenizer.fit_on_texts(texts)
    # Same dict object as the tokenizer's: the unseen words added below
    # are therefore visible to texts_to_sequences.
    self.word_index = word_tokenizer.word_index
    self.nb_words = len(self.word_index)
    print(
        'Found {} unique lowercased words including -unseen- and <-root->.'
        .format(self.nb_words))

    # Look up the word embeddings.
    glove_size = opts.embedding_dim
    self.embeddings_index = {}
    print('Indexing word vectors.')
    with open(opts.word_embeddings_file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
    print('Found {} word vectors.'.format(len(self.embeddings_index)))

    ## words that appear in the embedding file but not in the training set
    unseens = list(
        set(self.embeddings_index.keys()) - set(self.word_index.keys()))
    nb_unseens = len(unseens)
    print('Found {} words not in the training set but in the glove data'.
          format(nb_unseens))
    ## +1 for padding (idx 0)
    self.word_embeddings = np.zeros(
        (self.nb_words + 1 + nb_unseens, glove_size))
    ## first index the words in the training set (missing ones stay zero)
    for word, i in self.word_index.items():
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:
            self.word_embeddings[i] = embedding_vector
    ## then append the unseen words to the word_index dictionary
    for unseen in unseens:
        self.word_index[unseen] = len(self.word_index) + 1
        self.word_embeddings[
            self.word_index[unseen]] = self.embeddings_index[unseen]
    self.idx_to_word = invert_dict(self.word_index)
    print('end glove indexing')

    texts = texts + _read_default(path_to_text_test)
    self.nb_validation_samples = len(texts) - self.nb_train_samples
    text_sequences = word_tokenizer.texts_to_sequences(texts)
    self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
    self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
    ## indexing sents files ends

    ## indexing char files
    if opts.chars_dim > 0:
        char_texts = _read_utf8(path_to_text)
        ## char embedding for <-root-> does not make sense
        char_tokenizer = Tokenizer(lower=False, char_encoding=True,
                                   root=False)
        char_tokenizer.fit_on_texts(char_texts)
        self.char_index = char_tokenizer.word_index
        self.nb_chars = len(self.char_index)
        self.idx_to_char = invert_dict(self.char_index)
        print(
            'Found {} unique characters including -unseen-. NOT including <-root->.'
            .format(self.nb_chars))
        char_texts = char_texts + _read_utf8(path_to_text_test)  ## do not lowercase tCO
        char_sequences = char_tokenizer.texts_to_sequences(char_texts)
        self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
        self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
    ## indexing char files ends

    ## indexing jackknife files
    if (opts.jk_dim > 0) or (opts.model in ['Parsing_Model_Joint_Both']):
        jk_texts = _read_default(path_to_jk)
        jk_tokenizer = Tokenizer(lower=False)
        jk_tokenizer.fit_on_texts(jk_texts, zero_padding=False)
        self.jk_index = jk_tokenizer.word_index
        self.nb_jk = len(self.jk_index)
        self.idx_to_jk = invert_dict(self.jk_index)
        print('Found {} unique POS tags including -unseen- and <-root->.'.
              format(self.nb_jk))
        jk_texts = jk_texts + _read_default(path_to_jk_test)  ## do not lowercase tCO
        jk_sequences = jk_tokenizer.texts_to_sequences(jk_texts)
        self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
        self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
        self.gold_jk = _flat_without_first(
            jk_sequences[self.nb_train_samples:])
    ## indexing jackknife files ends

    ## indexing stag files
    if (opts.stag_dim > 0) or (opts.model in [
            'Parsing_Model_Joint', 'Parsing_Model_Shuffle',
            'Parsing_Model_Joint_Both'
    ]):
        tag_texts = _read_default(path_to_tag)
        tag_tokenizer = Tokenizer(lower=False)  ## for tCO
        ## if zero_padding is True, index 0 is reserved, never assigned
        ## to an existing word
        tag_tokenizer.fit_on_texts(tag_texts, zero_padding=False)
        self.tag_index = tag_tokenizer.word_index
        self.nb_stags = len(self.tag_index)
        self.idx_to_tag = invert_dict(self.tag_index)
        print('Found {} unique supertags including -unseen- and <-root->.'.
              format(self.nb_stags))
        tag_texts = tag_texts + _read_default(path_to_tag_test)  ## do not lowercase tCO
        tag_sequences = tag_tokenizer.texts_to_sequences(tag_texts)
        self.inputs_train['stags'] = tag_sequences[:self.nb_train_samples]
        self.inputs_test['stags'] = tag_sequences[self.nb_train_samples:]
        self.gold_stags = _flat_without_first(
            tag_sequences[self.nb_train_samples:])
    ## indexing stag files ends

    ## indexing rel files
    rel_texts = _read_default(path_to_rel)
    rel_tokenizer = Tokenizer(lower=False)
    rel_tokenizer.fit_on_texts(rel_texts, zero_padding=False)
    self.rel_index = rel_tokenizer.word_index
    self.nb_rels = len(self.rel_index)
    self.idx_to_rel = invert_dict(self.rel_index)
    print(
        'Found {} unique rels including -unseen-, NOT including <-root->.'.
        format(self.nb_rels))
    rel_texts = rel_texts + _read_default(path_to_rel_test)  ## do not lowercase tCO
    rel_sequences = rel_tokenizer.texts_to_sequences(rel_texts)
    self.inputs_train['rels'] = rel_sequences[:self.nb_train_samples]
    self.inputs_test['rels'] = rel_sequences[self.nb_train_samples:]
    self.gold_rels = _flat_without_first(
        rel_sequences[self.nb_train_samples:])
    ## indexing rel files ends

    ## indexing arc files
    ## Notice arc sequences are already integers
    arc_lines = _read_default(path_to_arc) + _read_default(path_to_arc_test)
    arc_sequences = arcs2seq(arc_lines)
    self.inputs_train['arcs'] = arc_sequences[:self.nb_train_samples]
    self.inputs_test['arcs'] = arc_sequences[self.nb_train_samples:]
    ## indexing arc files ends
    self.gold_arcs = np.hstack(arc_sequences[self.nb_train_samples:])

    if path_to_punc_test is not None:
        # Work on an independent list so that filling in the masks can
        # never write into arc_sequences.
        punc = list(arc_sequences[self.nb_train_samples:])
        with open(path_to_punc_test) as fhand:
            for sent_idx, line in zip(range(len(punc)), fhand):
                # One flag per token; the 1-based positions listed on the
                # line are switched to False.
                mask = [True] * len(punc[sent_idx])
                for punc_idx in map(int, line.split()):
                    mask[punc_idx - 1] = False
                punc[sent_idx] = mask
        self.punc = np.hstack(punc)

    ## padding the train inputs and test inputs
    self.inputs_train = {
        key: pad_sequences(x, key)
        for key, x in self.inputs_train.items()
    }
    ## dummy parents for the roots
    self.inputs_train['arcs'] = np.hstack([
        np.zeros([self.inputs_train['arcs'].shape[0], 1]).astype(int),
        self.inputs_train['arcs']
    ])
    # Fixed seed: every run uses the same training permutation.
    random.seed(0)
    perm = np.arange(self.nb_train_samples)
    random.shuffle(perm)
    self.inputs_train = {
        key: x[perm]
        for key, x in self.inputs_train.items()
    }
    self.inputs_test = {
        key: pad_sequences(x, key)
        for key, x in self.inputs_test.items()
    }
    ## dummy parents for the roots
    self.inputs_test['arcs'] = np.hstack([
        np.zeros([self.inputs_test['arcs'].shape[0], 1]).astype(int),
        self.inputs_test['arcs']
    ])
    ## padding ends

    ## setting the current indices
    self._index_in_epoch = 0
    self._epoch_completed = 0
    self._index_in_test = 0