def __init__(self, dataset, tokenizer: Tokenizer, vocab_size=5000):
    self.tokenizer = tokenizer
    # Load the data
    with open(dataset, 'r', encoding='utf-8') as f:
        self.data = json.load(f)
    # Preprocess the data (str to int)
    for i, d in enumerate(self.data):
        self.data[i]['content'] = tokenizer.tokens_to_ids(d['content'])
    # Load the token classes used for masking
    self.total_tokens = tokenizer.get_tokens(
        vocab_prefix=f'vocab_{vocab_size}', for_masking=True)
def load_dataset(args):
    suffix = f'v{args.vocab_size}_t{args.max_num_tokens}'
    tokenizer = Tokenizer(tokenizer_name=args.tokenizer_name,
                          prefix='vocab_{}'.format(args.vocab_size))
    dataset = BertClsDataset(f'bertcls_test_{suffix}.json', tokenizer=tokenizer, inference=True)
    print('dataset#:', len(dataset))

    def collate(batch):
        if args.seq_ensemble:
            # Assumes batch_size == 1
            docs = []
            item = batch[0]
            if len(item[0]) > args.max_num_seq:
                for i in range(0, len(item[0]) - args.max_num_seq, 5):
                    docs.append(item[0][i:i + args.max_num_seq])
            else:
                docs.append(item[0][:args.max_num_seq])
        else:
            docs = [item[0][:args.max_num_seq] for item in batch]
        labels = [item[1] for item in batch]
        return [docs, labels]

    test_data_loader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collate)
    return test_data_loader
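# Hedged usage sketch (not from the original source): exercising load_dataset with an
# argparse-style namespace. The field names (vocab_size, max_num_tokens, tokenizer_name,
# seq_ensemble, max_num_seq, batch_size) are assumptions inferred from the code above,
# and the preprocessed bertcls_test_*.json file is assumed to exist on disk.
from argparse import Namespace

args = Namespace(vocab_size=5000, max_num_tokens=512, tokenizer_name='bpe',
                 seq_ensemble=False, max_num_seq=20, batch_size=16)
test_loader = load_dataset(args)
for docs, labels in test_loader:
    # docs: token-id sequences truncated to max_num_seq; labels: list of class ids
    break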
def __init__(self, dataset, tokenizer: Tokenizer, max_num_seq=20, inference=False,
             vocab_size=5000, is_train=True):
    self.max_num_seq = max_num_seq
    self.inference = inference
    self.is_train = is_train
    self.tokenizer = tokenizer
    self.total_tokens = tokenizer.get_tokens(
        vocab_prefix=f'vocab_{vocab_size}', for_masking=True)
    # Load the data
    with open(dataset, 'r', encoding='utf-8') as f:
        self.data = json.load(f)
    # Preprocess the data (str to int)
    for i, d in enumerate(self.data):
        doc = d['content']
        n_doc = []
        for sub_doc in doc:
            n_doc.append(self.tokenizer.tokens_to_ids(sub_doc))
            # n_doc.append(list(map(self.tokenizer.PieceToId, sub_doc.split())))
        self.data[i]['content'] = n_doc
def __init__(self, lang='english'):
    self.prep_items = [
        PrepLowerCase(),
        Tokenizer(word_tokenize),
        FilterByRegex(only_letters_pattern(lang=lang))
    ]
    self.construct_mapping()
def loaddatawithtokenize(i=0, nb_words=20000, start_char=1, oov_char=2,
                         index_from=3, withraw=False, datalen=500):
    (traindata, adversarialdata, testdata, numclass) = loaddata(i)
    rawtrain = traindata.content[:]
    rawadversarial = adversarialdata.content[:]
    rawtest = testdata.content[:]
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(traindata.content + testdata.content)
    adversarialdata.content = tokenizer.texts_to_sequences(adversarialdata.content)
    traindata.content = tokenizer.texts_to_sequences(traindata.content)
    testdata.content = tokenizer.texts_to_sequences(testdata.content)
    if start_char is None:
        adversarialdata.content = [[w + index_from for w in x] for x in adversarialdata.content]
        traindata.content = [[w + index_from for w in x] for x in traindata.content]
        testdata.content = [[w + index_from for w in x] for x in testdata.content]
    else:
        adversarialdata.content = [[start_char] + [w + index_from for w in x] for x in adversarialdata.content]
        traindata.content = [[start_char] + [w + index_from for w in x] for x in traindata.content]
        testdata.content = [[start_char] + [w + index_from for w in x] for x in testdata.content]
    adversarialdata.content = [[w if w < nb_words else oov_char for w in x] for x in adversarialdata.content]
    traindata.content = [[w if w < nb_words else oov_char for w in x] for x in traindata.content]
    testdata.content = [[w if w < nb_words else oov_char for w in x] for x in testdata.content]
    adversarialdata.content = pad_sequences(adversarialdata.content, maxlen=datalen)
    traindata.content = pad_sequences(traindata.content, maxlen=datalen)
    testdata.content = pad_sequences(testdata.content, maxlen=datalen)
    if withraw:
        return traindata, adversarialdata, testdata, tokenizer, numclass, rawtrain, rawadversarial, rawtest
    else:
        return traindata, adversarialdata, testdata, tokenizer, numclass
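# Hedged illustration (not from the original source) of the Keras-IMDB-style index
# convention used above: reserve 1 for the start marker, 2 for out-of-vocabulary
# words, and shift real word indices up by index_from.
start_char, oov_char, index_from, nb_words = 1, 2, 3, 10
raw = [4, 7, 25]                                              # word ids from the tokenizer
shifted = [start_char] + [w + index_from for w in raw]        # -> [1, 7, 10, 28]
clipped = [w if w < nb_words else oov_char for w in shifted]  # -> [1, 7, 2, 2]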
def __init__(self, opts):
    f_train = open(opts.data_dir, 'r')
    tokenizer = Tokenizer(opts)
    tokenizer.fit_on_texts(f_train)
    f_train.close()
    self.count = tokenizer.count
    reverse_dictionary = dict(zip(tokenizer.word_index.values(),
                                  tokenizer.word_index.keys()))
    print('Most common words (+UNK)', self.count[:5])
    self.unigrams = [x[1] for x in self.count]
    self.dictionary = tokenizer.word_index
    with open('../word2vec_models/{0}/{1}dictionary.pkl'.format(
            os.path.basename(opts.data_dir), opts.vocab_size), 'wb') as fhand:
        pickle.dump(self.dictionary, fhand)
    self.reverse_dictionary = reverse_dictionary
    self.num_words = sum([count for _, count in tokenizer.word_counts.items()])
    num_centers = self.num_words - 2 * opts.window_size
    num_examples = num_centers * opts.window_size * 2
    self.num_examples = num_examples // opts.batch_size * opts.batch_size
    self.num_batches = self.num_examples / opts.batch_size
    self._epoch_completed = 0
    self._example_index = 0
    self._batch_index = 0
    self._window_index = 0
    self._relative_window_index = -opts.window_size
    self.center = opts.window_size
    self._index_in_epoch = 0
    self.tokenizer = tokenizer
    # self.window = self.tokenizer.next_window()
    self.opts = opts
    ## starting data partition
    if opts.data_dir == '../data/clean_train':
        self.num_parts = 100  # the entire enwiki is too huge
    else:
        self.num_parts = 4
    self.num_per_partition = self.num_words // self.num_parts
    self.num_remainders = self.num_words - self.num_per_partition * (self.num_parts - 1)
    self.data_partition = self.tokenizer.texts_to_sequences_partition(self.num_per_partition)
    self._partition_index = 0
if not os.path.exists(args.model_path):
    os.makedirs(args.model_path)

# Set up the training hyperparameters
optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
optim_schedule = ScheduledOptim(optimizer, args.d_hidden, n_warmup_steps=args.warmup_steps)

def mask_ce(logits, labels, mask):
    x = F.log_softmax(logits, dim=-1)
    loss = F.nll_loss(x.transpose(1, 2), labels, reduction='none') * mask
    return loss.sum() / (mask.sum() + 1e-5)

criterion = mask_ce

# Set up the data loaders
suffix = f'v{args.vocab_size}_t{args.max_num_tokens}'
tokenizer = Tokenizer(tokenizer_name=args.tokenizer_name,
                      prefix='vocab_{}'.format(args.vocab_size))
train_dataset = BertLMDataset(f'bertlm_train_{suffix}.json', tokenizer=tokenizer)
val_dataset = BertLMDataset(f'bertlm_val_{suffix}.json', tokenizer=tokenizer)
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
print(args)
# print(model)

def test():
    '''Run an [Acc, loss] evaluation over test_data_loader.'''
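# Hedged sketch (assumed shapes, not from the original source): mask_ce expects logits
# of shape [batch, seq_len, vocab] and labels/mask of shape [batch, seq_len].
# F.nll_loss wants class scores on dim 1, hence the transpose(1, 2) above; the mask
# restricts the average to the masked-LM positions only.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 8, 100)            # [batch, seq_len, vocab]
labels = torch.randint(0, 100, (2, 8))     # [batch, seq_len]
mask = (torch.rand(2, 8) > 0.85).float()   # 1.0 only at masked positions
loss = mask_ce(logits, labels, mask)       # scalar, averaged over masked positions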
def __init__(self, opts):
    if opts.task == 'POS_models':
        data_dir = 'data/pos_data'
        self.kfold = False
    elif opts.jackknife:
        data_dir = 'data/super_data'
        jk_data_dir = 'data/pos_data'
        path_to_k_fold = os.path.join(jk_data_dir, 'train_y.txt')
        path_to_k_fold_test = os.path.join(jk_data_dir, 'test_y.txt')
        self.kfold = True
    else:
        data_dir = 'data/super_data'
        self.kfold = False
    path_to_text = os.path.join(data_dir, 'train_x.txt')
    path_to_text_test = os.path.join(data_dir, 'test_x.txt')
    path_to_POS = os.path.join(data_dir, 'train_y.txt')
    path_to_POS_test = os.path.join(data_dir, 'test_y.txt')
    self.MAX_NB_WORDS = 200000000000

    # first, build index mapping words in the embeddings set
    # to their embedding vector
    f_train = open(path_to_text)
    f_test = open(path_to_text_test)
    texts = f_train.readlines()
    nb_train_samples = len(texts)
    self.nb_train_samples = nb_train_samples
    texts = texts + f_test.readlines()
    f_train.close()
    f_test.close()
    print('length', len(texts))

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    indicator = tokenizer.cap_indicator(texts)
    num_indicator = tokenizer.num_indicator(texts)
    suffix = tokenizer.suffix_extract(texts)
    suffix_tokenizer = Tokenizer()
    suffix_tokenizer.fit_on_texts(suffix, non_split=True)
    suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix, non_split=True)
    # debugging
    # for i in xrange(len(sequences)):
    #     assert len(sequences[i]) == len(suffix_sequences[i])
    word_index = tokenizer.word_index
    self.word_index = word_index
    suffix_index = suffix_tokenizer.word_index
    print('Found %s unique words.' % len(word_index))
    data = pad_sequences(sequences, opts, True)
    suffix_data = pad_sequences(suffix_sequences, opts)
    cap_indicator = pad_sequences(indicator, opts)
    num_indicator = pad_sequences(num_indicator, opts)

    f_train = open(path_to_POS)
    f_test = open(path_to_POS_test)
    texts = f_train.readlines() + f_test.readlines()
    f_train.close()
    f_test.close()
    lab_tokenizer = Tokenizer()
    lab_tokenizer.fit_on_texts(texts)
    lab_sequences = lab_tokenizer.texts_to_sequences(texts)
    tag_index = lab_tokenizer.word_index
    self.tag_index = tag_index
    self.tag_size = len(tag_index)
    print('Found %s unique tags.' % len(tag_index))
    labels = pad_sequences(lab_sequences, opts)
    # labels = np.expand_dims(labels, -1)  # do not need it for tensorflow

    if opts.jackknife:
        f_train = open(path_to_k_fold)
        f_test = open(path_to_k_fold_test)
        texts = f_train.readlines() + f_test.readlines()
        f_train.close()
        f_test.close()
        jk_tokenizer = Tokenizer()
        jk_tokenizer.fit_on_texts(texts)
        jk_sequences = jk_tokenizer.texts_to_sequences(texts)
        jk_index = jk_tokenizer.word_index
        self.jk_index = jk_index
        self.jk_size = len(jk_index)
        print('Found %s unique jackknife tags.' % len(jk_index))
        jk_labels = pad_sequences(jk_sequences, opts)

    indices = np.arange(nb_train_samples)
    np.random.shuffle(indices)
    nb_validation_samples = data.shape[0] - nb_train_samples
    self.nb_validation_samples = nb_validation_samples

    ## define zero matrices first for splitting
    seq_length = labels.shape[1]
    if opts.attention in [100, 101, 102, 103]:
        self.nb_train_added = nb_train_samples // 10 * 10 + 10
        self.nb_validation_added = nb_validation_samples // 10 * 10 + 10
    else:
        self.nb_train_added = nb_train_samples
        self.nb_validation_added = nb_validation_samples
    self.X_train = np.zeros([self.nb_train_added, seq_length])
    self.X_train[:nb_train_samples] = data[:-nb_validation_samples][indices]
    if opts.jackknife:
        self.jk_labels = np.zeros([self.nb_train_added, seq_length])
        self.jk_labels[:nb_train_samples] = jk_labels[indices]
        self.jk_labels_test = np.zeros([self.nb_validation_added, seq_length])
        self.jk_labels_test[:nb_validation_samples] = jk_labels[-nb_validation_samples:]
    self.train_cap_indicator = np.zeros([self.nb_train_added, seq_length])
    self.train_cap_indicator[:nb_train_samples] = cap_indicator[:-nb_validation_samples][indices]
    self.train_num_indicator = np.zeros([self.nb_train_added, seq_length])
    self.train_num_indicator[:nb_train_samples] = num_indicator[:-nb_validation_samples][indices]
    self.suffix_train = np.zeros([self.nb_train_added, seq_length])
    self.suffix_train[:nb_train_samples] = suffix_data[:-nb_validation_samples][indices]
    self.y_train = np.zeros([self.nb_train_added, seq_length])
    self.y_train[:nb_train_samples] = labels[:-nb_validation_samples][indices]
    if opts.joint:
        self.pos_train = self.jk_labels
    self.X_test = np.zeros([self.nb_validation_added, seq_length])
    self.X_test[:nb_validation_samples] = data[-nb_validation_samples:]
    self.test_cap_indicator = np.zeros([self.nb_validation_added, seq_length])
    self.test_cap_indicator[:nb_validation_samples] = cap_indicator[-nb_validation_samples:]
    self.test_num_indicator = np.zeros([self.nb_validation_added, seq_length])
    self.test_num_indicator[:nb_validation_samples] = num_indicator[-nb_validation_samples:]
    self.suffix_test = np.zeros([self.nb_validation_added, seq_length])
    self.suffix_test[:nb_validation_samples] = suffix_data[-nb_validation_samples:]
    self.y_test = np.zeros([self.nb_validation_added, seq_length])
    self.y_test[:nb_validation_samples] = labels[-nb_validation_samples:]
    if opts.joint:
        self.pos_test = self.jk_labels_test

    if opts.jackknife:
        K = 10
        # k_fold_samples = nb_train_samples//K*K
        samples_per_group = (nb_train_samples // K) + 1
        print('splitting into {} folds'.format(K))
        ## don't get rid of the remainders; we will save all of them
        ## (a commented-out earlier version used np.split over the first K*samples rows
        ##  and dropped the remaining examples)
        self.X_train_k_fold = []
        self.train_cap_indicator_k_fold = []
        self.train_num_indicator_k_fold = []
        self.suffix_train_k_fold = []
        self.y_train_k_fold = []
        for k in xrange(K):
            self.X_train_k_fold.append(
                self.X_train[samples_per_group * k:samples_per_group * (k + 1)])
            self.train_cap_indicator_k_fold.append(
                self.train_cap_indicator[samples_per_group * k:samples_per_group * (k + 1)])
            self.train_num_indicator_k_fold.append(
                self.train_num_indicator[samples_per_group * k:samples_per_group * (k + 1)])
            self.suffix_train_k_fold.append(
                self.suffix_train[samples_per_group * k:samples_per_group * (k + 1)])
            self.y_train_k_fold.append(
                self.jk_labels[samples_per_group * k:samples_per_group * (k + 1)])
        # if opts.joint:
        #     self.pos_train = self.pos_train[:k_fold_samples]
        print('end splitting')

    self.nb_suffix = len(suffix_index)
    self.suffix_embedding_mat = np.random.randn(self.nb_suffix + 1, 10)
    self.nb_words = min(self.MAX_NB_WORDS, len(word_index))

    ## cond entropy
    self.cond_matrix = np.ones((self.nb_words + 1, 1))
    with open('certain.pkl') as fhand:
        certain = pickle.load(fhand)
    self.certain_words = []
    for certain_word in certain:
        self.certain_words.append(self.word_index[certain_word])
    for certain_word in self.certain_words:
        self.cond_matrix[certain_word] = 0.0
    ## cond entropy ends

    if opts.embedding_name == 'random':
        np.random.seed(opts.seed)
        self.embedding_matrix = np.random.uniform(
            -2, 2, size=(self.nb_words + 1, opts.embedding_dim))
    elif opts.embedding_name == 'word2vec':
        if not opts.embedding_dim == 300:  # word2vec is of 300 dim
            sys.exit('error in dim')
        filename = os.path.join('../word2vec', 'GoogleNews-vectors-negative300.bin')
        import gensim
        self.embedding_matrix = np.zeros((self.nb_words + 1, opts.embedding_dim))
        self.word2vec_model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            filename, binary=True)
        print('Found %s word vectors.' % len(self.word2vec_model.vocab))
        for word, i in word_index.items():
            if i > self.MAX_NB_WORDS and word in self.word2vec_model.vocab:
                self.embedding_matrix[i] = self.word2vec_model[word]
    else:
        self.embeddings_index = {}
        print('Indexing word vectors.')
        f = open(opts.embedding_name)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print('Found %s word vectors.' % len(self.embeddings_index))
        self.embedding_matrix = np.zeros((self.nb_words + 1, opts.embedding_dim))
        for word, i in word_index.items():
            if i > self.MAX_NB_WORDS:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in the embedding index will be all-zeros
                if not self.embedding_matrix.shape[1] == len(embedding_vector):
                    sys.exit('error in dim')
                self.embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    self._index_in_epoch = 0
    self._num_examples = self.X_train.shape[0]
    self._num_test_examples = self.X_test.shape[0]
    self._epoch_completed = 0
    self._index_in_test = 0
    if opts.jackknife:
        # self._num_hold_in_examples = self.X_train_k_fold[0].shape[0]*(K-1)
        # self._num_hold_out_examples = self.X_train_k_fold[0].shape[0]
        self.k = 0
    self.opts = opts
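# Hedged numpy sketch (not from the original source) of the jackknife split above:
# with samples_per_group = n // K + 1, slicing in steps of samples_per_group keeps
# every example (the last fold is simply smaller), unlike splitting n // K * K rows
# with np.split, which would drop the remainder.
import numpy as np

n, K = 103, 10
X = np.arange(n)
samples_per_group = n // K + 1                      # 11
folds = [X[samples_per_group * k: samples_per_group * (k + 1)] for k in range(K)]
assert sum(len(f) for f in folds) == n              # nothing is discarded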
def get_data(force=False):
    picklefile = os.path.join(CACHE_DIR, 'data.pickle')
    if not force and os.path.isfile(picklefile):
        print('Loading data from pickle...')
        data = pickle.load(open(picklefile, 'rb'))
        return data

    print('\nLoading data...')
    (X_train, y_train), (X_dev, y_dev), (X_test, y_test) = load_data()
    print(len(X_train), 'train sequences.')
    print(len(X_dev), 'dev sequences.')
    print(len(X_test), 'test sequences.')

    print('\nExtracting features...')
    X_train_sep_feats = extract_features(X_train)
    X_dev_sep_feats = extract_features(X_dev)
    X_test_sep_feats = extract_features(X_test)
    X_train_feat_tokens = []
    X_dev_feat_tokens = []
    X_test_feat_tokens = []
    feature_sizes = []
    for i in range(8):
        feat_tokenizer = Tokenizer(lower=False, cutoff=3, nb_unknowns=1, padding=True)
        feat_tokenizer.fit_on_texts(X_train_sep_feats[i])
        X_train_feat = feat_tokenizer.texts_to_sequences(X_train_sep_feats[i])
        X_dev_feat = feat_tokenizer.texts_to_sequences(X_dev_sep_feats[i])
        X_test_feat = feat_tokenizer.texts_to_sequences(X_test_sep_feats[i])
        X_train_feat_tokens.append(X_train_feat)
        X_dev_feat_tokens.append(X_dev_feat)
        X_test_feat_tokens.append(X_test_feat)
        feat_size = len(feat_tokenizer.word_index)
        feature_sizes.append(feat_size)

    # get dev and test vocabulary
    print('\nDev vocab:')
    dev_tokenizer = Tokenizer(lower=True, cutoff=0, nb_unknowns=3)
    dev_tokenizer.fit_on_texts(X_dev, verbose=True)
    print(len(dev_tokenizer.word_index.keys()))
    print('\nTest vocab:')
    test_tokenizer = Tokenizer(lower=True, cutoff=0, nb_unknowns=3)
    test_tokenizer.fit_on_texts(X_test, verbose=True)
    print(len(test_tokenizer.word_index.keys()))
    extra_vocab = set(list(dev_tokenizer.word_index.keys()) +
                      list(test_tokenizer.word_index.keys()))
    print('\nTest/dev vocab: {}.'.format(len(extra_vocab)))

    print('\nTokenizing...')
    word_tokenizer = Tokenizer(lower=True, cutoff=0, nb_unknowns=3)
    word_tokenizer.fit_on_texts(X_train)
    X_train = word_tokenizer.texts_to_sequences(X_train)
    X_dev = word_tokenizer.texts_to_sequences(X_dev)
    X_test = word_tokenizer.texts_to_sequences(X_test)
    word_index = word_tokenizer.word_index
    tag_tokenizer = Tokenizer(lower=False, nb_words=425)
    tag_tokenizer.fit_on_texts(y_train)
    y_train = tag_tokenizer.texts_to_sequences(y_train)
    y_dev = tag_tokenizer.texts_to_sequences(y_dev)
    y_test = tag_tokenizer.texts_to_sequences(y_test)
    tag_index = tag_tokenizer.word_index
    print('Vocabulary size: {}. CCGTag size: {}.'.format(len(word_index), len(tag_index)))

    try:
        f = open(picklefile, 'wb')
        data = {
            'X_train': X_train, 'X_test': X_test, 'X_dev': X_dev,
            'y_train': y_train, 'y_test': y_test, 'y_dev': y_dev,
            'tag_index': tag_index, 'word_index': word_index,
            'X_train_feats': X_train_feat_tokens,
            'X_dev_feats': X_dev_feat_tokens,
            'X_test_feats': X_test_feat_tokens,
            'feature_sizes': feature_sizes,
            'extra_vocab': extra_vocab
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        f.close()
    except Exception as e:
        print('Unable to save data to', picklefile, ':', e)
        raise
    return data
print('\nDev sample:')
print(X_dev[0])
print(y_dev[0])
print('\nTest sample:')
print(X_test[0])
print(y_test[0])
max_train_len = max([len(x) for x in X_train])
max_dev_len = max([len(x) for x in X_dev])
max_test_len = max([len(x) for x in X_test])
print('\nMax sentence length for train/dev/test: {}/{}/{}'.format(
    max_train_len, max_dev_len, max_test_len))
print('\nTokenizing...')
word_tokenizer = Tokenizer(lower=True, cutoff=0, nb_unknowns=3)
word_tokenizer.fit_on_texts(X_train, verbose=True)
tag_tokenizer = Tokenizer(lower=False, nb_words=425)
tag_tokenizer.fit_on_texts(y_train, verbose=True)
X_train_t = word_tokenizer.texts_to_sequences(X_train, verbose=True)
X_dev_t = word_tokenizer.texts_to_sequences(X_dev, verbose=True)
X_test_t = word_tokenizer.texts_to_sequences(X_test, verbose=True)
y_train_t = tag_tokenizer.texts_to_sequences(y_train, verbose=True)
y_dev_t = tag_tokenizer.texts_to_sequences(y_dev, verbose=True)
y_test_t = tag_tokenizer.texts_to_sequences(y_test, verbose=True)
print('\nTokenized train sample:')
print(X_train_t[0])
from preprocessing import remove_stopwords, lemmatize_words, Tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

configurations = {
    'unigrams': {
        'vectorizer': CountVectorizer(
            binary=True,
            tokenizer=Tokenizer().tokenize,
            ngram_range=[1, 1])
    },
    'unigrams-tfidf': {
        'vectorizer': TfidfVectorizer(
            tokenizer=Tokenizer().tokenize,
            ngram_range=[1, 1])
    },
    'unigrams-lemmatization': {
        'vectorizer': CountVectorizer(
            binary=True,
            tokenizer=Tokenizer([lemmatize_words]).tokenize,
            ngram_range=[1, 1])
    },
    'unigrams-lemmatization-tfidf': {
        'vectorizer': TfidfVectorizer(
            tokenizer=Tokenizer([lemmatize_words]).tokenize,
            ngram_range=[1, 1])
    },
    'unigrams-lemmatization-stopwords': {
        'vectorizer': CountVectorizer(
            binary=True,
            tokenizer=Tokenizer([lemmatize_words, remove_stopwords]).tokenize,
            ngram_range=[1, 1])
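# Hedged usage sketch (not from the original source), assuming the configurations dict
# above is closed off in the same pattern as the earlier entries: fit one configured
# vectorizer on a couple of toy documents.
docs = ['The cats are sleeping.', 'A dog barked at the cats.']
vectorizer = configurations['unigrams']['vectorizer']
X = vectorizer.fit_transform(docs)           # binary unigram term-document matrix
print(X.shape, len(vectorizer.vocabulary_))  # (2, n_terms) and the learned vocab size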
def __init__(self, opts, test_opts=None):
    path_to_text = opts.text_train
    path_to_tag = opts.tag_train
    path_to_jk = opts.jk_train
    if test_opts is None:
        path_to_text_test = opts.text_test
        path_to_tag_test = opts.tag_test
        path_to_jk_test = opts.jk_test
    else:
        path_to_text_test = test_opts.text_test
        path_to_tag_test = test_opts.tag_test
        path_to_jk_test = test_opts.jk_test
    self.inputs_train = {}
    self.inputs_test = {}

    ## indexing sents files
    f_train = io.open(path_to_text, encoding='utf-8')
    texts = f_train.readlines()
    self.nb_train_samples = len(texts)
    f_train.close()
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(texts)
    #print(tokenizer.word_index['-unseen-'])
    self.word_index = tokenizer.word_index
    sorted_freqs = tokenizer.sorted_freqs
    self.nb_words = len(self.word_index)
    print('Found {} unique lowercased words including -unseen-.'.format(self.nb_words))

    # look up the glove word embeddings
    # need to reserve indices for the testing file
    glove_size = opts.embedding_dim
    self.embeddings_index = {}
    print('Indexing word vectors.')
    #f = open('glovevector/glove.6B.{}d.txt'.format(glove_size))
    f = io.open(opts.word_embeddings_file, encoding='utf-8')
    for line in f:
        values = line.strip().split(' ')
        if len(values) == opts.embedding_dim + 1:
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
    f.close()
    print('Found {} word vectors.'.format(len(self.embeddings_index)))
    unseens = list(set(self.embeddings_index.keys()) - set(self.word_index.keys()))  ## words that appear in glove but not in the training set
    nb_unseens = len(unseens)
    print('Found {} words not in the training set'.format(nb_unseens))
    self.word_embeddings = np.zeros((self.nb_words + 1 + nb_unseens, glove_size))  ## +1 for padding (idx 0)

    ## get frequencies for adversarial training (Yasunaga et al. 2017)
    self.word_freqs = np.zeros([self.nb_words + 1 + nb_unseens])
    self.word_freqs[1:self.nb_words] = sorted_freqs  ## skip zero padding (index 0)
    self.word_freqs = self.word_freqs.astype(np.float32)
    self.word_freqs = self.word_freqs / np.sum(self.word_freqs)
    for word, i in self.word_index.items():  ## first index the words in the training set
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:  ## otherwise zero vector
            self.word_embeddings[i] = embedding_vector
    for unseen in unseens:
        self.word_index[unseen] = len(self.word_index) + 1  ## add unseen words to the word_index dictionary
        self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
    self.idx_to_word = invert_dict(self.word_index)
    print('end glove indexing')
    f_test = io.open(path_to_text_test, encoding='utf-8')
    texts = texts + f_test.readlines()
    self.nb_validation_samples = len(texts) - self.nb_train_samples
    f_test.close()
    text_sequences = tokenizer.texts_to_sequences(texts)
    #print(map(lambda x: self.idx_to_word[x], text_sequences[self.nb_train_samples]))
    self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
    self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
    ## indexing sents files ends

    ## indexing suffixes
    if opts.suffix_dim > 0:
        suffix = tokenizer.suffix_extract(texts)
        suffix_tokenizer = Tokenizer()
        suffix_tokenizer.fit_on_texts(suffix[:self.nb_train_samples], non_split=True)
        self.suffix_index = suffix_tokenizer.word_index
        self.nb_suffixes = len(self.suffix_index)
        sorted_freqs = suffix_tokenizer.sorted_freqs
        self.suffix_freqs = np.zeros([self.nb_suffixes + 1]).astype(np.float32)  ## +1 for zero padding
        self.suffix_freqs[1:self.nb_suffixes] = sorted_freqs  ## skip zero padding (index 0)
        self.suffix_freqs = self.suffix_freqs / np.sum(self.suffix_freqs)
        self.idx_to_suffix = invert_dict(self.suffix_index)
        print('Found {} unique suffixes including -unseen-.'.format(self.nb_suffixes))
        suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix, non_split=True)
        #print(map(lambda x: self.idx_to_suffix[x], suffix_sequences[self.nb_train_samples]))
        self.inputs_train['suffix'] = suffix_sequences[:self.nb_train_samples]
        self.inputs_test['suffix'] = suffix_sequences[self.nb_train_samples:]
    ## indexing suffixes ends

    ## indexing capitalization
    if opts.cap:
        cap_sequences = tokenizer.cap_indicator(texts)
        #print(cap_sequences[self.nb_train_samples])
        self.inputs_train['cap'] = cap_sequences[:self.nb_train_samples]
        self.inputs_test['cap'] = cap_sequences[self.nb_train_samples:]
    ## indexing capitalization ends

    ## indexing numbers
    if opts.num:
        num_sequences = tokenizer.num_indicator(texts)
        #print(num_sequences[self.nb_train_samples])
        self.inputs_train['num'] = num_sequences[:self.nb_train_samples]
        self.inputs_test['num'] = num_sequences[self.nb_train_samples:]
    ## indexing numbers ends

    ## indexing jackknife files
    if opts.jk_dim > 0:
        f_train = io.open(path_to_jk, encoding='utf-8')
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False)
        tokenizer.fit_on_texts(texts)
        self.jk_index = tokenizer.word_index
        self.nb_jk = len(self.jk_index)
        sorted_freqs = tokenizer.sorted_freqs
        self.jk_freqs = np.zeros([self.nb_jk + 1]).astype(np.float32)  ## +1 for zero padding
        self.jk_freqs[1:self.nb_jk] = sorted_freqs  ## skip zero padding (index 0)
        self.jk_freqs = self.jk_freqs / np.sum(self.jk_freqs)
        self.idx_to_jk = invert_dict(self.jk_index)
        print('Found {} unique tags including -unseen-.'.format(self.nb_jk))
        f_test = io.open(path_to_jk_test, encoding='utf-8')
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        jk_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
        self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
        self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
    ## indexing jackknife files ends

    ## indexing char files
    if opts.chars_dim > 0:
        f_train = io.open(path_to_text, encoding='utf-8')
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False, char_encoding=True)
        tokenizer.fit_on_texts(texts)
        self.char_index = tokenizer.word_index
        self.nb_chars = len(self.char_index)
        sorted_freqs = tokenizer.sorted_freqs
        self.char_freqs = np.zeros([self.nb_chars + 1]).astype(np.float32)  ## +1 for zero padding
        self.char_freqs[1:self.nb_chars] = sorted_freqs  ## skip zero padding (index 0)
        self.char_freqs = self.char_freqs / np.sum(self.char_freqs)
        self.idx_to_char = invert_dict(self.char_index)
        print('Found {} unique characters including -unseen-.'.format(self.nb_chars))
        f_test = io.open(path_to_text_test, encoding='utf-8')
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        char_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
        self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
        self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
    ## indexing char files ends

    ## indexing stag files
    f_train = open(path_to_tag)
    texts = f_train.readlines()
    f_train.close()
    tokenizer = Tokenizer(lower=False)  ## for tCO
    tokenizer.fit_on_texts(texts, zero_padding=False)
    #print(tokenizer.word_index['-unseen-'])
    self.tag_index = tokenizer.word_index
    self.nb_tags = len(self.tag_index)
    self.idx_to_tag = invert_dict(self.tag_index)
    print('Found {} unique tags including -unseen-.'.format(self.nb_tags))
    f_test = open(path_to_tag_test)
    texts = texts + f_test.readlines()  ## do not lowercase tCO
    f_test.close()
    tag_sequences = tokenizer.texts_to_sequences(texts)
    #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
    self.inputs_train['tags'] = tag_sequences[:self.nb_train_samples]
    self.inputs_test['tags'] = tag_sequences[self.nb_train_samples:]
    ## indexing stag files ends

    self.test_gold = np.hstack(tag_sequences[self.nb_train_samples:])  ## for calculation of accuracy

    ## padding the train inputs and test inputs
    #self.inputs_train = [pad_sequences(x) for x in self.inputs_train]
    self.inputs_train = {key: pad_sequences(x, key) for key, x in self.inputs_train.items()}
    random.seed(0)
    perm = np.arange(self.nb_train_samples)
    random.shuffle(perm)
    self.inputs_train = {key: x[perm] for key, x in self.inputs_train.items()}
    #self.inputs_train = [x[perm] for x in self.inputs_train]
    #self.inputs_test = [pad_sequences(x) for x in self.inputs_test]
    self.inputs_test = {key: pad_sequences(x, key) for key, x in self.inputs_test.items()}

    ## setting the current indices
    self._index_in_epoch = 0
    self._epoch_completed = 0
    self._index_in_test = 0
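# Hedged miniature (not from the original source) of the GloVe indexing scheme above:
# training-set words keep their tokenizer indices, embedding-only words are appended
# after them, and row 0 is left as zero padding.
import numpy as np

word_index = {'the': 1, 'cat': 2}                          # from the training set
embeddings_index = {'the': np.ones(4), 'dog': np.full(4, 2.0)}
unseens = sorted(set(embeddings_index) - set(word_index))  # ['dog']
word_embeddings = np.zeros((len(word_index) + 1 + len(unseens), 4))
for word, i in word_index.items():
    if word in embeddings_index:
        word_embeddings[i] = embeddings_index[word]
for unseen in unseens:
    word_index[unseen] = len(word_index) + 1               # 'dog' -> index 3
    word_embeddings[word_index[unseen]] = embeddings_index[unseen]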
def __init__(self, opts, test_opts=None):
    path_to_text = opts.text_train
    path_to_tag = opts.tag_train
    path_to_jk = opts.jk_train
    path_to_arc = opts.arc_train
    path_to_rel = opts.rel_train
    if test_opts is None:
        path_to_text_test = opts.text_test
        path_to_tag_test = opts.tag_test
        path_to_jk_test = opts.jk_test
        path_to_arc_test = opts.arc_test
        path_to_rel_test = opts.rel_test
        path_to_punc_test = opts.punc_test
    else:
        path_to_text_test = test_opts.text_test
        path_to_tag_test = test_opts.tag_test
        path_to_jk_test = test_opts.jk_test
        path_to_arc_test = test_opts.arc_test
        path_to_rel_test = test_opts.rel_test
        path_to_punc_test = test_opts.punc_test
    self.inputs_train = {}
    self.inputs_test = {}

    ## indexing sents files
    f_train = open(path_to_text)
    texts = f_train.readlines()
    self.nb_train_samples = len(texts)
    f_train.close()
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(texts)
    #print(tokenizer.word_index['-unseen-'])
    self.word_index = tokenizer.word_index
    self.nb_words = len(self.word_index)
    print('Found {} unique lowercased words including -unseen- and <-root->.'.format(self.nb_words))

    # look up the glove word embeddings
    # need to reserve indices for the testing file
    glove_size = opts.embedding_dim
    self.embeddings_index = {}
    print('Indexing word vectors.')
    f = open(opts.word_embeddings_file)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        self.embeddings_index[word] = coefs
    f.close()
    print('Found {} word vectors.'.format(len(self.embeddings_index)))
    unseens = list(set(self.embeddings_index.keys()) - set(self.word_index.keys()))  ## words that appear in glove but not in the training set
    nb_unseens = len(unseens)
    print('Found {} words not in the training set but in the glove data'.format(nb_unseens))
    self.word_embeddings = np.zeros((self.nb_words + 1 + nb_unseens, glove_size))  ## +1 for padding (idx 0)
    for word, i in self.word_index.items():  ## first index the words in the training set
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:  ## otherwise zero vector
            self.word_embeddings[i] = embedding_vector
    for unseen in unseens:
        self.word_index[unseen] = len(self.word_index) + 1  ## add unseen words to the word_index dictionary
        self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
    self.idx_to_word = invert_dict(self.word_index)
    print('end glove indexing')
    f_test = open(path_to_text_test)
    texts = texts + f_test.readlines()
    self.nb_validation_samples = len(texts) - self.nb_train_samples
    f_test.close()
    text_sequences = tokenizer.texts_to_sequences(texts)
    #print(map(lambda x: self.idx_to_word[x], text_sequences[self.nb_train_samples]))
    self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
    self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
    ## indexing sents files ends

    ## indexing char files
    if opts.chars_dim > 0:
        f_train = io.open(path_to_text, encoding='utf-8')
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False, char_encoding=True, root=False)  ## char embedding for <-root-> does not make sense
        tokenizer.fit_on_texts(texts)
        self.char_index = tokenizer.word_index
        self.nb_chars = len(self.char_index)
        self.idx_to_char = invert_dict(self.char_index)
        print('Found {} unique characters including -unseen-. NOT including <-root->.'.format(self.nb_chars))
        f_test = io.open(path_to_text_test, encoding='utf-8')
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        char_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
        self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
        self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
    ## indexing char files ends

    ## indexing jackknife files
    if (opts.jk_dim > 0) or (opts.model in ['Parsing_Model_Joint_Both']):
        f_train = open(path_to_jk)
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False)
        tokenizer.fit_on_texts(texts, zero_padding=False)
        self.jk_index = tokenizer.word_index
        self.nb_jk = len(self.jk_index)
        self.idx_to_jk = invert_dict(self.jk_index)
        print('Found {} unique POS tags including -unseen- and <-root->.'.format(self.nb_jk))
        f_test = open(path_to_jk_test)
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        jk_sequences = tokenizer.texts_to_sequences(texts)
        self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
        self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
        self.gold_jk = np.hstack(map(lambda x: x[1:], jk_sequences[self.nb_train_samples:]))
    ## indexing jackknife files ends

    ## indexing stag files
    if (opts.stag_dim > 0) or (opts.model in ['Parsing_Model_Joint', 'Parsing_Model_Shuffle', 'Parsing_Model_Joint_Both']):
        f_train = open(path_to_tag)
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False)  ## for tCO
        tokenizer.fit_on_texts(texts, zero_padding=False)  ## if zero_padding is True, index 0 is reserved and never assigned to an existing word
        self.tag_index = tokenizer.word_index
        self.nb_stags = len(self.tag_index)
        self.idx_to_tag = invert_dict(self.tag_index)
        print('Found {} unique supertags including -unseen- and <-root->.'.format(self.nb_stags))
        f_test = open(path_to_tag_test)
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        tag_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
        self.inputs_train['stags'] = tag_sequences[:self.nb_train_samples]
        self.inputs_test['stags'] = tag_sequences[self.nb_train_samples:]
        self.gold_stags = np.hstack(map(lambda x: x[1:], tag_sequences[self.nb_train_samples:]))
    ## indexing stag files ends

    ## indexing rel files
    f_train = open(path_to_rel)
    texts = f_train.readlines()
    f_train.close()
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(texts, zero_padding=False)
    self.rel_index = tokenizer.word_index
    self.nb_rels = len(self.rel_index)
    self.idx_to_rel = invert_dict(self.rel_index)
    print('Found {} unique rels including -unseen-, NOT including <-root->.'.format(self.nb_rels))
    f_test = open(path_to_rel_test)
    texts = texts + f_test.readlines()  ## do not lowercase tCO
    f_test.close()
    rel_sequences = tokenizer.texts_to_sequences(texts)
    #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
    self.inputs_train['rels'] = rel_sequences[:self.nb_train_samples]
    self.inputs_test['rels'] = rel_sequences[self.nb_train_samples:]
    self.gold_rels = np.hstack(map(lambda x: x[1:], rel_sequences[self.nb_train_samples:]))
    ## indexing rel files ends

    ## indexing arc files
    ## notice arc sequences are already integers
    f_train = open(path_to_arc)
    arc_sequences = f_train.readlines()
    f_train.close()
    f_test = open(path_to_arc_test)
    arc_sequences = arcs2seq(arc_sequences + f_test.readlines())
    f_test.close()
    self.inputs_train['arcs'] = arc_sequences[:self.nb_train_samples]
    self.inputs_test['arcs'] = arc_sequences[self.nb_train_samples:]
    ## indexing arc files ends
    self.gold_arcs = np.hstack(arc_sequences[self.nb_train_samples:])

    if path_to_punc_test is not None:
        self.punc = arc_sequences[self.nb_train_samples:]
        with open(path_to_punc_test) as fhand:
            for sent_idx, line in zip(xrange(len(self.punc)), fhand):
                self.punc[sent_idx] = [True for _ in xrange(len(self.punc[sent_idx]))]
                for punc_idx in map(int, line.split()):
                    self.punc[sent_idx][punc_idx - 1] = False
        self.punc = np.hstack(self.punc)  #.astype(bool)

    ## padding the train inputs and test inputs
    self.inputs_train = {key: pad_sequences(x, key) for key, x in self.inputs_train.items()}
    self.inputs_train['arcs'] = np.hstack([
        np.zeros([self.inputs_train['arcs'].shape[0], 1]).astype(int),
        self.inputs_train['arcs']
    ])  ## dummy parents for the roots
    random.seed(0)
    perm = np.arange(self.nb_train_samples)
    random.shuffle(perm)
    self.inputs_train = {key: x[perm] for key, x in self.inputs_train.items()}
    self.inputs_test = {key: pad_sequences(x, key) for key, x in self.inputs_test.items()}
    ## dummy parents for the roots
    self.inputs_test['arcs'] = np.hstack([
        np.zeros([self.inputs_test['arcs'].shape[0], 1]).astype(int),
        self.inputs_test['arcs']
    ])
    ## padding ends

    ## setting the current indices
    self._index_in_epoch = 0
    self._epoch_completed = 0
    self._index_in_test = 0
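# Hedged numpy sketch (not from the original source) of the "dummy parents for the
# roots" step above: prepend a zero column so the artificial root position has a
# parent index of 0 and the head indices line up with the padded word positions.
import numpy as np

arcs = np.array([[2, 0, 2],     # head indices for a 3-word sentence
                 [0, 1, 0]])    # a padded 2-word sentence
arcs = np.hstack([np.zeros([arcs.shape[0], 1]).astype(int), arcs])
# arcs is now [[0, 2, 0, 2], [0, 0, 1, 0]]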