def build_embeddings(self, embeddings_vec_path, *raw_datas, oov_as_unk=True, lower=True):
    """Build an Embeddings object that includes vectors for the words in the data.

    Args:
        embeddings_vec_path (str): Path to the pretrained word-vector file (e.g. FastText).
        raw_datas (list of dict): Raw data **TOKENIZED** with tokenize_data, loaded from a json file.
        oov_as_unk (bool): Whether to map words missing from the pretrained vectors to the unknown token.
            Otherwise, OOV embeddings are randomly initialized.
    """
    words = {}
    for raw_data in raw_datas:
        words = self._collect_words(raw_data, words)

    self.embeddings = Embeddings(embeddings_vec_path, words, oov_as_unk, lower=lower)
    self.embeddings.add('<pad>', torch.tensor([0.] * self.embeddings.get_dim()))
    self.embeddings.add('<teacher>')
    self.embeddings.add('<student>')
    self.embeddings.add('CANNOTANSWER')
def __init__(self, text, args, device):
    super(NMT, self).__init__()
    self.text = text
    self.args = args
    self.device = device
    self.Embeddings = Embeddings(args['embed_size'], self.text)
    self.encoder_layer = nn.TransformerEncoderLayer(
        d_model=args['d_model'], nhead=args['nhead'],
        dim_feedforward=args['dim_feedforward'], dropout=args['dropout'])
    self.encoder_norm = nn.LayerNorm(args['d_model'])
    self.encoder = nn.TransformerEncoder(
        encoder_layer=self.encoder_layer,
        num_layers=args['num_encoder_layers'],
        norm=self.encoder_norm)
    self.decoder_layer = nn.TransformerDecoderLayer(
        d_model=args['d_model'], nhead=args['nhead'],
        dim_feedforward=args['dim_feedforward'], dropout=args['dropout'])
    self.decoder_norm = nn.LayerNorm(args['d_model'])
    self.decoder = nn.TransformerDecoder(
        decoder_layer=self.decoder_layer,
        num_layers=args['num_decoder_layers'],
        norm=self.decoder_norm)
    self.project = nn.Linear(args['d_model'], len(self.text.tar), bias=False)
    self.project.weight = self.Embeddings.tar.weight
    self.dropout = nn.Dropout(args['dropout'])
    self.project_value = math.pow(args['d_model'], 0.5)
    self.eps = args['smoothing_eps']
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    # The multi-head attention dimension matches the embedding dimension. Is that required?
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
def choose_mnist(self):
    print "CHOSE MNIST"
    global predictor, autoencode_model, embeddings
    predictor = autoencode_predict.predict(
        name="meta-data/mnist/autoencode_model", color_depth=1)
    predictor.stop()
    predictor.restore()
    autoencode_model = predictor.autoencode_model
    embeddings = Embeddings(predictor)
    print "Loading images ..."
    if 'mnist' not in self.data_sets:
        print "Key missing. Building ImageData"
        imageData = LazyLoadWrapper(
            BatchWrapper(
                ResizeWrapper(ReshapeWrapper(Mnist(), [28, 28, 1]), [32, 32])))
        imageData.getImages()
        self.data_sets['mnist'] = imageData
    print " mnist shape is", self.data_sets['mnist'].getImages().shape
    print "... loading images done"
    embeddings.data_set = self.data_sets['mnist'].getImages()
    return self.data_sets['mnist']
def __init__(self, iT, corefs, model):
    self.iT = iT
    self.corefs = corefs
    self.embeddings = Embeddings(model)
    dist, components = self.computeProgression()
    self.distances = dist
    self.components = components
def __init__(self, X_train: list, Y_train: list, embed_path: str, embed_dim: int,
             stop_words=[], X_test=[], Y_test=[], max_len=None, epochs=3,
             batch_size=256):
    # Preprocessing the text
    X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
    Y_train = np.asarray(Y_train)

    # Tokenizing the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # Saving the tokenizer
    self.tokenizer = tokenizer

    # Creating the embedding matrix
    embedding = Embeddings(embed_path, embed_dim)
    embedding_matrix = embedding.create_embedding_matrix(
        tokenizer, len(tokenizer.word_counts))

    # Creating the padded input for the deep learning model
    if max_len is None:
        max_len = np.max([len(text.split()) for text in X_train])
    TextToTensor_instance = TextToTensor(tokenizer=tokenizer, max_len=max_len)
    X_train = TextToTensor_instance.string_to_tensor(X_train)

    # Creating the model
    rnn = RnnModel(embedding_matrix=embedding_matrix, embedding_dim=embed_dim,
                   max_len=max_len)
    rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)
    self.model = rnn.model

    # If X_test is provided we make predictions with the created model
    if len(X_test) > 0:
        X_test = [clean_text(text) for text in X_test]
        X_test = TextToTensor_instance.string_to_tensor(X_test)
        yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]
        self.yhat = yhat

        # If true labels are provided we calculate the accuracy of the model
        if len(Y_test) > 0:
            self.acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
            self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
class Pipeline():
    def __init__(self, text, model='Word2Vec'):
        self.model = Embeddings(model)
        self.model.fit_corpus(text)
        self.model.train()

    def evaluate(self, test='word-similarity', datasets=['wordsim353-rel']):
        evaluator = Evaluator(test=test, datasets=datasets, metric='spearman')
        return evaluator.evaluate(self.model)
def set_up():
    if request.is_json:
        content = request.get_json()
        if content['key'] == 'fox':
            Loader().download_all_models()
        if content['key'] == 'snake':
            embedding_model = Embeddings()
        if content['key'] == 'sitara':
            Loader().download_all_models()
            embedding_model = Embeddings()
    return 'All data is downloaded'
class Sentiment(object):
    """NP_chunking data preparation"""

    def __init__(self, data_name, num_class=5):
        self.data_name = data_name
        self.train_data_path = '../data/' + self.data_name + '/train.txt'
        self.test_data_path = '../data/' + self.data_name + '/test.txt'
        self.dev_data_path = '../data/' + self.data_name + '/dev.txt'
        self.embeddings = Embeddings(data_name)
        self.num_class = num_class
        start_time = time.time()
        self.load_data()
        print('Reading datasets consumes %.3f seconds' % (time.time() - start_time))

    def deal_with_data(self, path):
        users, products, labels, docs, len_docs, len_words = [], [], [], [], [], []
        k = 0
        for line in open(path, 'r', encoding='UTF-8'):
            tokens = line.strip().split('\t\t')
            users.append(tokens[0])
            products.append(tokens[1])
            labels.append(int(tokens[2]) - 1)
            doc = tokens[3].strip().split('<sssss>')
            len_docs.append(len(doc))
            doc = [sentence.strip().split(' ') for sentence in doc]
            len_words.append([len(sentence) for sentence in doc])
            docs.append(doc)
            k += 1
        return users, products, labels, docs

    def load_data(self):
        train_users, train_products, train_labels, train_docs = self.deal_with_data(
            self.train_data_path)
        test_users, test_products, test_labels, test_docs = self.deal_with_data(
            self.test_data_path)
        dev_users, dev_products, dev_labels, dev_docs = self.deal_with_data(
            self.dev_data_path)

        train_docs = self.embeddings.docs2ids(train_docs)
        test_docs = self.embeddings.docs2ids(test_docs)
        dev_docs = self.embeddings.docs2ids(dev_docs)

        train_users = self.embeddings.users2ids(train_users)
        test_users = self.embeddings.users2ids(test_users)
        dev_users = self.embeddings.users2ids(dev_users)

        train_products = self.embeddings.prdts2ids(train_products)
        test_products = self.embeddings.prdts2ids(test_products)
        dev_products = self.embeddings.prdts2ids(dev_products)

        self.train_set = list(
            zip(train_docs, train_labels, train_users, train_products))
        self.test_set = list(
            zip(test_docs, test_labels, test_users, test_products))
        self.dev_set = list(zip(dev_docs, dev_labels, dev_users, dev_products))
def get_pretrained_embeddings(path, vocab, method='word2vec'):
    emb = Embeddings()
    model = emb.load_model(method=method, model_path=path)
    embed_size = model.vector_size
    embeddings = np.zeros((len(vocab), embed_size))
    oov_count = 0
    for word in vocab:
        word_index = vocab[word]
        if word in model.vocab:
            embeddings[word_index] = model[word]
        else:
            oov_count += 1
    print('OOV count: %i' % oov_count)
    return embeddings.astype('float32')
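# Minimal runnable sketch of the matrix-building loop in get_pretrained_embeddings above.
# A plain dict stands in for the gensim model; the toy words and vectors are assumptions
# used only for illustration, not part of the original snippet.
import numpy as np

toy_model = {'cat': np.array([0.1, 0.2]), 'dog': np.array([0.3, 0.4])}  # word -> vector
vocab = {'<pad>': 0, 'cat': 1, 'dog': 2, 'platypus': 3}                 # word -> row index
embed_size = 2

embeddings = np.zeros((len(vocab), embed_size))
oov_count = 0
for word, word_index in vocab.items():
    if word in toy_model:
        embeddings[word_index] = toy_model[word]
    else:
        oov_count += 1          # rows for OOV words stay zero
print('OOV count: %i' % oov_count)  # -> OOV count: 2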
def test_all_terms_have_embeddings(path_terms: str, path_embeddings: str) -> None:
    """Test if all terms have embeddings for the given two files.

    Args:
        path_terms: Text-file with 1 term per line.
        path_embeddings: vec-file with term and dimension values separated by space.
    """
    terms = load_terms(path_terms)
    with open('<input here correct path>', 'r', encoding='utf8') as f:
        idx_to_term = json.load(f)
    embeddings = Embeddings.load_term_embeddings(terms, path_embeddings, idx_to_term)
    embedded_terms = set(embeddings)
    not_in_et = []
    for t in terms:
        if t not in embedded_terms:
            not_in_et.append(t)
    if len(not_in_et) != 0:
        msg1 = 'Error! Not all terms have embeddings. '
        msg2 = 'Num terms without embeddings: {}. '.format(len(not_in_et))
        if len(not_in_et) < 20:
            msg3 = 'Terms without embeddings: {}'.format(not_in_et)
        else:
            msg3 = ''
        raise Exception(msg1 + msg2 + msg3)
def test_word2vec_set():
    embed = Embeddings('./data/word2vec.txt', True, word_set={'a', 'b', 'c'})
    matrix = embed.matrix
    assert matrix.shape == (5, 3)
    assert len(embed.vocab) == 3
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
def embedKG(self):
    self.logger.info("Embedding NP and relation phrases")
    fname1 = self.p.out_path + self.p.file_entEmbed
    fname2 = self.p.out_path + self.p.file_relEmbed

    if not checkFile(fname1) or not checkFile(fname2):
        embed = Embeddings(self.p, self.side_info, self.logger)
        embed.fit()

        self.ent2embed = embed.ent2embed  # Get the learned NP embeddings
        self.rel2embed = embed.rel2embed  # Get the learned RP embeddings

        pickle.dump(self.ent2embed, open(fname1, 'wb'))
        pickle.dump(self.rel2embed, open(fname2, 'wb'))
    else:
        self.logger.info('\tLoading cached Embeddings')
        self.ent2embed = pickle.load(open(fname1, 'rb'))
        self.rel2embed = pickle.load(open(fname2, 'rb'))
def choose_garden(self):
    print "CHOSE GARDEN"
    global predictor, autoencode_model, embeddings
    predictor = autoencode_predict.predict(
        name="meta-data/garden/garden_model", color_depth=3)
    predictor.stop()
    predictor.restore()
    autoencode_model = predictor.autoencode_model
    embeddings = Embeddings(predictor)
    config_data = json.load(open("data/file_data.json", "r"))
    print "Loading images ..."
    if 'garden' not in self.data_sets:
        print "Key missing. Building ImageData"
        print "Loading files ...",
        files = LazyLoadWrapper(
            ResizeWrapper(
                FileReader(config_data["file_names"], config_data["labels"]),
                [64, 64]))
        files.init()
        print "done."
        print "Calculating full size ...",
        full_size = LazyLoadWrapper(ResizeWrapper(files, [32, 32]))
        full_size.init()
        print "done."
        print "Calculating half size ...",
        half_size = LazyLoadWrapper(SliceWrapper(files, 32, 16))
        half_size.init()
        print "done."
        print "Calculating concat the whole thing ...",
        self.data_sets['garden'] = LazyLoadWrapper(
            BatchWrapper(ConcatWrapper([full_size, half_size])))
        print "done."
        self.data_sets['garden'].getImages()
    print " garden shape is", self.data_sets['garden'].getImages().shape
    print "... loading images done"
    embeddings.data_set = self.data_sets['garden'].getImages()
    return self.data_sets['garden']
def embeddings(args):
    kf = KFold(n_splits=args.splits_num, shuffle=args.shuffle, random_state=42)

    score_lst = list()
    for fold, (train_index, valid_index) in enumerate(kf.split(users)):
        train_users = users[train_index]
        train_movies = movies[train_index]
        train_ratings = ratings[train_index]

        valid_users = users[valid_index]
        valid_movies = movies[valid_index]
        valid_ratings = ratings[valid_index]

        model = Embeddings(
            number_of_users,
            number_of_movies,
            embeddings_size=args.embeddings_size,
            dropout_embeddings=args.embeddings_dropout_embeddings,
            dropout=args.embeddings_dropout)

        model.fit(train_users, train_movies, train_ratings,
                  valid_users=valid_users, valid_movies=valid_movies,
                  valid_ratings=valid_ratings,
                  epochs=args.embeddings_num_epochs,
                  verbose=args.verbose,
                  decay=args.embeddings_decay,
                  decay_steps=args.embeddings_decay_steps,
                  learning_rate=args.embeddings_learning_rate,
                  batch_size=args.embeddings_batch_size)

        preds = model.predict(valid_users, valid_movies)
        score = root_mean_square_error(valid_ratings, preds)
        score_lst.append(score)
        print("Fold:", fold + 1, "score:", score)

    print('Mean CV RMSE:', np.mean(score_lst))
def embedding():
    if request.is_json:
        content = request.get_json()
        serializer = EmbeddingSerializer(data=content)
        if not serializer.is_valid():
            return 'Error'
        text = serializer.text
        token = serializer.token
        vector = Embeddings().build_sentence_vector(text).tolist()
        data = json.dumps({"vector": vector, "token": token})
        return data
def get_clus_center(
        node: int,
        taxonomy: taxonomy_type,
        path_out: str,
) -> Iterator[float]:
    """Get the cluster center for the given node id."""
    emb_path = os.path.join(path_out, 'embeddings/' + str(node) + '.vec')
    term_ids = set([t[0] for t in taxonomy[node]['terms']])
    local_embeddings = Embeddings.load_term_embeddings(
        term_ids, emb_path, term_ids_to_embs_global)
    clus_center = mean(local_embeddings, axis=0)
    return clus_center
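# Toy check of the cluster-center computation in get_clus_center above: the center is
# simply the element-wise mean of the node's term embeddings. The numpy array below is
# an assumed stand-in for the loaded local embeddings, for illustration only.
import numpy as np

local_embeddings = np.array([[1.0, 0.0],
                             [0.0, 1.0],
                             [1.0, 1.0]])      # one row per term in the node
clus_center = np.mean(local_embeddings, axis=0)
print(clus_center)                             # -> [0.66666667 0.66666667]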
def load_term_ids_to_embs_global(lemmatized: bool, emb_type: str, path_out: str):
    """Load global term embeddings."""
    global term_ids_to_embs_global
    path_emb_dir = os.path.join(path_out, 'embeddings/')
    if lemmatized:
        fname = 'embs_lemma_global_{}.vec'.format(emb_type)
    else:
        fname = 'embs_token_global_{}.vec'.format(emb_type)
    emb_path = path_emb_dir + fname
    term_ids = load_term_ids(lemmatized, path_out)
    term_ids_to_embs_global = Embeddings.load_term_embeddings(
        term_ids, emb_path, {})
def __init__(self, path: str):
    """Initialize a hypernym classifier.

    Currently only svm classification is implemented.

    Args:
        path: The path to the output directory.
    """
    # Set paths.
    self.path = path
    self.path_idx_to_term = os.path.join(
        path, 'indexing/idx_to_token.json')
    self.path_term_to_idx = os.path.join(
        path, 'indexing/token_to_idx.json')
    self.path_embs = os.path.join(
        path, 'embeddings/embs_token_global_Word2Vec.vec')
    self.path_hearst = os.path.join(
        path, 'hierarchy/hierarch_rels_tokens_tg_idx.json')

    # Load data.
    with open(self.path_idx_to_term, 'r', encoding='utf8') as f:
        self.idx_to_term = {int(k): v for k, v in json.load(f).items()}
    with open(self.path_term_to_idx, 'r', encoding='utf8') as f:
        self.term_to_idx = json.load(f)
    with open(self.path_hearst, 'r', encoding='utf8') as f:
        self.hearst = {int(k): v for k, v in json.load(f).items()}

    self.hearst_term_ids = set()
    for hyper in self.hearst:
        self.hearst_term_ids.add(hyper)
        for hypo in self.hearst[hyper]:
            self.hearst_term_ids.add(hypo)

    self.path_term_idxs = os.path.join(
        self.path, 'processed_corpus/token_terms_idxs.txt')
    with open(self.path_term_idxs, 'r', encoding='utf8') as f:
        self.term_ids = set([int(i) for i in f.readlines()])

    self.embedding_dict = Embeddings.load_term_embeddings(
        set(self.idx_to_term.keys()), self.path_embs, self.idx_to_term)

    # Instantiate classifier.
    self.clf = SVC(kernel='rbf', C=10, gamma=0.1, probability=True,
                   random_state=0)
def load_embeddings_vocab(self):
    pretrained_embeddings = Embeddings()
    # read filtered embeddings
    if not tf.gfile.Exists(config.filtered_embeddings_path):
        word_to_vec = pretrained_embeddings.load_universal_embeddings()
        self.create_vocabulary(
            self.vocab_file,
            pretrained_embeddings.all_words(word_to_vec),
            tokenizer=None)
        word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)
        filtered_embeddings = pretrained_embeddings.filter_vocab_embeddings(
            word_to_vec, word_to_idx.keys())
        with open(config.filtered_embeddings_path, 'wb') as output_file:
            pickle.dump(filtered_embeddings, output_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
    else:
        word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)

    word_prob = self.read_unigram_freq(self.unigram_prob_file)
    assert 1.01 > sum(
        [0 if val is None else val for val in word_prob.values()]) > 0.99, "What?!"

    pre_embs_dict, embd_dim = pretrained_embeddings.load_filtered_pretrained_embeddings(
        config.filtered_embeddings_path)
    word_vec = pretrained_embeddings.get_embedding_matrix(
        pre_embs_dict, word_to_idx, embd_dim)

    self.word_vec = word_vec
    self.word_prob = word_prob
    self.word_to_idx = word_to_idx
    self.idx_to_word = idx_to_word

    train_path = os.path.join(config.data_dir, config.data_files['train'])
    dev_path = os.path.join(config.data_dir, config.data_files['dev'])
    self.write_data_to_token_ids(train_path, target_path=train_path)
    self.write_data_to_token_ids(dev_path, target_path=dev_path)
def prepare_word_embeddings(query_lang_emb, qlang_long, doc_lang_emb, dlang_long,
                            limit_emb, normalize=False, processes=40):
    """
    Creates Word Embedding Helper Object

    :param query_lang_emb: language of queries
    :param qlang_long: long form of the query language name
    :param doc_lang_emb: language of documents
    :param dlang_long: long form of the document language name
    :param limit_emb: load only first n embeddings
    :param normalize: transform to unit vectors
    :param processes: number of parallel workers
    :return:
    """
    embeddings = Embeddings()
    embeddings.load_embeddings(query_lang_emb, processes=processes, language=qlang_long,
                               limit=limit_emb, normalize=normalize)
    embeddings.load_embeddings(doc_lang_emb, processes=processes, language=dlang_long,
                               limit=limit_emb, normalize=normalize)
    return embeddings
class Network:
    def __init__(self, path, train, helper):
        self._helper = helper
        self._train_bool = train
        n_properties = self._helper.n_properties
        n_input = n_properties * w2v_layer_size
        if self._train_bool:
            self._prev_model = path
            logging.info("Using previous word2vec model: " + self._prev_model)
            self._word_vectors = WordVectors(self._prev_model, w2v_layer_size, "UNKNOWN")
        else:
            self._model_dir = path
            self._word_vectors = WordVectors(self._model_dir + "/word2vec.txt", w2v_layer_size, "UNKNOWN")
        self._x = tf.placeholder("float", [None, n_input])
        self._y = tf.placeholder("float", [None, n_classes])
        self._input_keep_prob = tf.placeholder("float")
        self._hidden_keep_prob = tf.placeholder("float")
        # ReLU
        w_h_stddev = math.sqrt(2 / n_input)
        # Xavier
        w_out_stddev = math.sqrt(3 / (nn_hidden_layer_size + n_classes))
        self._weights = {
            "h": tf.Variable(tf.truncated_normal([n_input, nn_hidden_layer_size], stddev=w_h_stddev), name="w_h"),
            "out": tf.Variable(tf.truncated_normal([nn_hidden_layer_size, n_classes], stddev=w_out_stddev), name="w_out")
        }
        self._biases = {
            "h": tf.Variable(tf.constant(0.1, shape=[nn_hidden_layer_size]), name="b_h"),
            "out": tf.Variable(tf.constant(0.0, shape=[n_classes]), name="b_out")
        }
        self._network = self._multilayer_perceptron(self._x, self._weights, self._biases)

    def _multilayer_perceptron(self, _X, _weights, _biases):
        input_layer_drop = tf.nn.dropout(_X, self._input_keep_prob)
        hidden_layer = tf.nn.relu(tf.add(tf.matmul(input_layer_drop, _weights["h"]), _biases["h"]))
        hidden_layer_drop = tf.nn.dropout(hidden_layer, self._hidden_keep_prob)
        return tf.matmul(hidden_layer_drop, _weights["out"]) + _biases["out"]

    def train(self, train_dir, model_dir):
        logging.info("Training network using " + train_dir)
        iter = DataSetIterator(self, train_dir, nn_batch_size)
        self._cat_embeddings = Embeddings(iter.cat_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._slot_embeddings = Embeddings(iter.slot_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._dist_embeddings = Embeddings(iter.dist_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._pos_embeddings = Embeddings(iter.pos_lexicon, w2v_layer_size, nn_embed_random_range, True)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self._network, self._y))
        regularizers = tf.nn.l2_loss(self._weights["h"]) + tf.nn.l2_loss(self._weights["out"]) \
            + tf.nn.l2_loss(self._biases["h"]) + tf.nn.l2_loss(self._biases["out"])
        cost += nn_l2_reg * regularizers
        optimizer = tf.train.AdagradOptimizer(learning_rate=nn_learning_rate).minimize(cost)
        grads_wrt_input_op = tf.gradients(cost, self._x)[0]
        init = tf.initialize_all_variables()
        saver = tf.train.Saver(max_to_keep=0)
        with tf.Session() as sess:
            sess.run(init)
            for epoch in range(1, nn_epochs + 1):
                logging.info("Training epoch " + str(epoch))
                curr_batch = 1
                sum_cost = 0
                while True:
                    next_batch = iter.next()
                    if not next_batch:
                        break
                    batch_xs, batch_ys, records_in_batch = next_batch
                    logging.info("Training batch " + str(epoch) + "/" + str(curr_batch))
                    _, grads_wrt_input = sess.run([optimizer, grads_wrt_input_op],
                                                  feed_dict={self._x: batch_xs, self._y: batch_ys,
                                                             self._input_keep_prob: nn_dropout,
                                                             self._hidden_keep_prob: nn_dropout})
                    logging.info("Network updated")
                    for i in range(len(records_in_batch)):
                        record = records_in_batch[i]
                        grad_wrt_input = nn_learning_rate * grads_wrt_input[i]
                        record.update_embeddings(grad_wrt_input, w2v_layer_size, self._cat_embeddings,
                                                 self._slot_embeddings, self._dist_embeddings,
                                                 self._pos_embeddings)
                    logging.info("Embeddings updated")
                    curr_cost = sess.run(cost, feed_dict={self._x: batch_xs, self._y: batch_ys,
                                                          self._input_keep_prob: nn_dropout,
                                                          self._hidden_keep_prob: nn_dropout})
                    logging.info("Cost: " + str(curr_cost))
                    curr_batch += 1
                    sum_cost += curr_cost
                logging.info("Epoch cost: " + str(sum_cost / float(curr_batch - 1)))
                model_epoch_dir = model_dir + "/epoch" + str(epoch)
                if not os.path.exists(model_epoch_dir):
                    os.makedirs(model_epoch_dir)
                self._serialize(saver, sess, model_epoch_dir)
                iter.reset()
            self._serialize(saver, sess, model_dir)
        logging.info("Network training complete")

    def test(self, test_dir, log_file, pos_thres, neg_thres):
        logging.info("Testing network using " + test_dir)
        iter = DataSetIterator(self, test_dir, 0)
        self._cat_embeddings = Embeddings(self._model_dir + "/cat.emb", w2v_layer_size, 0, False)
        self._slot_embeddings = Embeddings(self._model_dir + "/slot.emb", w2v_layer_size, 0, False)
        self._dist_embeddings = Embeddings(self._model_dir + "/dist.emb", w2v_layer_size, 0, False)
        self._pos_embeddings = Embeddings(self._model_dir + "/pos.emb", w2v_layer_size, 0, False)
        saver = tf.train.Saver()
        model_path = self._model_dir + "/model.out"
        with tf.Session() as sess:
            saver.restore(sess, model_path)
            batch_xs, batch_ys, records_in_batch = iter.next()
            logging.info("Number of test examples: " + str(len(records_in_batch)))
            correct_prediction = tf.equal(tf.argmax(self._network, 1), tf.argmax(self._y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            y_p = tf.argmax(self._network, 1)
            y_p_raw = tf.split(1, 2, tf.nn.softmax(self._network))[1]
            val_accuracy, y_network, y_network_raw = sess.run(
                [accuracy, y_p, y_p_raw],
                feed_dict={self._x: batch_xs, self._y: batch_ys,
                           self._input_keep_prob: 1.0, self._hidden_keep_prob: 1.0})
            y_true = np.argmax(batch_ys, 1)
            logging.info("Accuracy: " + str(val_accuracy))
            self._evaluate_thresholds(y_true, y_network_raw, pos_thres, neg_thres)
            with open(log_file + ".classified1", "w") as out_correct, \
                    open(log_file + ".classified0", "w") as out_incorrect:
                logging.info("Writing to files")
                for i in range(len(records_in_batch)):
                    prediction = y_network[i]
                    if prediction >= 0.5:
                        out_correct.write(" ".join(records_in_batch[i].list) + " "
                                          + str(int(records_in_batch[i].value)) + "\n")
                    else:
                        out_incorrect.write(" ".join(records_in_batch[i].list) + " "
                                            + str(int(records_in_batch[i].value)) + "\n")
        logging.info("Network testing complete")

    def _evaluate_thresholds(self, y_true, y_network_raw, pos_thres, neg_thres):
        for j in range(5, 10):
            pos_threshold = j / float(10)
            neg_threshold = (10 - j) / float(10)
            self._evaluate_threshold(y_true, y_network_raw, pos_threshold, neg_threshold)
        self._evaluate_threshold(y_true, y_network_raw, pos_thres, neg_thres)

    def _evaluate_threshold(self, y_true, y_network_raw, pos_threshold, neg_threshold):
        sub_true = list()
        sub_network = list()
        for i in range(len(y_true)):
            # inverse logit
            prediction = y_network_raw[i]
            if prediction >= pos_threshold:
                sub_true.append(y_true[i])
                sub_network.append(1)
            elif prediction <= neg_threshold:
                sub_true.append(y_true[i])
                sub_network.append(0)
        logging.info("Evaluation threshold: " + str(pos_threshold) + ", " + str(neg_threshold))
        sub_true.append(0)
        sub_network.append(0)
        sub_true.append(0)
        sub_network.append(1)
        sub_true.append(1)
        sub_network.append(0)
        sub_true.append(1)
        sub_network.append(1)
        confusion_matrix = sklearn.metrics.confusion_matrix(sub_true, sub_network)
        confusion_matrix -= 1
        logging.info("Examples labeled as 0 classified by model as 0: " + str(confusion_matrix[0][0]))
        logging.info("Examples labeled as 0 classified by model as 1: " + str(confusion_matrix[0][1]))
        logging.info("Examples labeled as 1 classified by model as 0: " + str(confusion_matrix[1][0]))
        logging.info("Examples labeled as 1 classified by model as 1: " + str(confusion_matrix[1][1]))
        logging.info("")

    def _writeUTF(self, string):
        utf8 = string.encode("utf-8")
        length = len(utf8)
        return struct.pack("!H", length) + struct.pack("!" + str(length) + "s", utf8)

    def _serialize(self, saver, sess, model_dir):
        logging.info("Serializing network")
        saver.save(sess, model_dir + "/model.out")
        wh = self._weights["h"].eval().reshape((1, -1), order="F")
        wout = self._weights["out"].eval().reshape((1, -1), order="F")
        bh = self._biases["h"].eval().reshape((1, -1), order="F")
        bout = self._biases["out"].eval().reshape((1, -1), order="F")
        h = np.hstack((wh, bh, wout, bout))
        if sys.byteorder == "little":
            h.byteswap(True)
        r, c = h.shape
        with open(model_dir + "/coeffs", "wb") as coeffs_file:
            coeffs_file.write(struct.pack("!i", 2))
            coeffs_file.write(struct.pack("!i", r))
            coeffs_file.write(struct.pack("!i", c))
            coeffs_file.write(struct.pack("!i", 1))
            coeffs_file.write(struct.pack("!i", 1))
            coeffs_file.write(self._writeUTF("float"))
            coeffs_file.write(self._writeUTF("real"))
            coeffs_file.write(self._writeUTF("HEAP"))
            coeffs_file.write(struct.pack("!i", c))
            coeffs_file.write(self._writeUTF("FLOAT"))
        with open(model_dir + "/coeffs", "ab") as coeffs_file:
            h.tofile(coeffs_file, "")
        logging.info("Serializing embeddings")
        self._cat_embeddings.serialize(model_dir + "/cat.emb")
        self._slot_embeddings.serialize(model_dir + "/slot.emb")
        self._dist_embeddings.serialize(model_dir + "/dist.emb")
        self._pos_embeddings.serialize(model_dir + "/pos.emb")
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import random
import os

embeddings = Embeddings(100, 4, 1, 4)

# getting data from preprocessing
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

# generating training data
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings

word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = 128
epochs = 10
window_size = 5
accuracy_threshold = 0.85
activation = 'relu'
custom_accuracy = 0
loss_function = 'mse'

model_name = 'POS_GRU ' + loss_function + "_" + str(custom_accuracy) + "_" + activation \
    + "_" + str(window_size) + "_" + str(batch_size)

embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()
pos2index, index2pos = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)

seq_in = []
seq_out = []

# generating dataset
for sentence in tokenized_pos_sentences:
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(y)
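# Runnable toy illustration of the sliding-window dataset generation above: for a window
# of size w, each input is w consecutive items and the target is the item that follows.
# The integer "sentence" is an assumed stand-in for an indexed POS sentence.
window_size = 3
sentence = [10, 11, 12, 13, 14, 15]
seq_in, seq_out = [], []
for i in range(len(sentence) - window_size - 1):
    seq_in.append(sentence[i:i + window_size])
    seq_out.append(sentence[i + window_size])
print(seq_in)   # -> [[10, 11, 12], [11, 12, 13]]
print(seq_out)  # -> [13, 14]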
def build_embedding(idxs=None, sequence_embeddings=None):
    return Embeddings(vocab.size(), opts.embedding_dim,
                      idxs=idxs, sequence_embeddings=sequence_embeddings)
class NMT(nn.Module):
    def __init__(self, text, options, device):
        super(NMT, self).__init__()
        self.options = options
        self.embeddings = Embeddings(options.embed_size, text)
        self.hidden_size = options.hidden_size
        self.window_size_d = options.window_size_d
        self.text = text
        self.device = device
        self.encoder_layer = options.encoder_layer
        self.decoder_layers = options.decoder_layers
        self.encoder = nn.LSTM(input_size=options.embed_size,
                               hidden_size=options.hidden_size,
                               num_layers=options.encoder_layer,
                               bias=True,
                               dropout=options.dropout_rate,
                               bidirectional=False)
        self.decoder = nn.LSTM(input_size=options.embed_size + options.hidden_size,
                               hidden_size=options.hidden_size,
                               num_layers=options.decoder_layers,
                               bias=True,
                               dropout=options.dropout_rate,
                               bidirectional=False)
        self.ht2tan = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=False)
        self.tan2pt = nn.Linear(in_features=self.hidden_size, out_features=1, bias=False)
        self.ct2ht = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False)
        self.ht2final = nn.Linear(in_features=self.hidden_size, out_features=len(self.text.tar), bias=False)

    def forward(self, source, target):
        len_ = []
        for sen in source:
            len_.append(len(sen))
        source_tensor = self.text.src.word2tensor(source, self.device).cuda()
        target_tensor = self.text.tar.word2tensor(target, self.device).cuda()
        encode_h, encode_len, encode_hn_cn = self.encode(source_tensor, len_)
        decode_out = self.decode(source_tensor, encode_hn_cn, encode_h, encode_len, target_tensor)
        P = nn.functional.log_softmax(self.ht2final(decode_out), dim=-1)  # sen_len * batch * vocab_size
        tar_mask = (target_tensor != self.text.tar['<pad>']).float()
        tar_log_pro = torch.gather(P, index=target_tensor[1:].unsqueeze(-1), dim=-1).squeeze(-1) * tar_mask[1:]
        return tar_log_pro.sum(dim=0)

    def encode(self, source_tensor, source_length):
        x = self.embeddings.src(source_tensor)
        source_length_tensor = torch.tensor(source_length, dtype=torch.int64)
        x = pack_padded_sequence(x, source_length_tensor.cpu(), enforce_sorted=False)
        output, (hn, cn) = self.encoder(x)
        output, each_len = pad_packed_sequence(output)
        output = output.permute(1, 0, 2)
        return output, each_len, (hn, cn)

    def decode(self, source_tensor, h0_c0, encode_h, encode_len, target_tensor):
        y = self.embeddings.tar(target_tensor)
        ht_ct = h0_c0
        ht = torch.zeros(encode_h.shape[0], self.hidden_size, device=self.device).cuda()
        output = []
        for y_t in y:
            now_ht_ct, now_ht = self.step(source_tensor, encode_h, encode_len,
                                          torch.cat((y_t, ht), dim=1).view(1, y.shape[1], -1), ht_ct)
            output.append(now_ht)
            ht_ct = now_ht_ct
            ht = now_ht
        return torch.stack(output).to(self.device).cuda()  # sen_len * batch * hidden_size

    # @profile
    def step(self, source, encode_h, encode_len, pre_yt, pre_ht_ct):
        '''
        yt, ht_ct = self.decoder(pre_yt, pre_ht_ct)
        yt = torch.squeeze(yt, dim=0)
        pt = nn.functional.sigmoid(self.tan2pt(nn.functional.tanh(self.ht2tan(yt))))
        batch_ct = None
        with torch.no_grad():
            for i, each_pt in enumerate(pt):
                each_pt = encode_len[i].item() * each_pt.item()
                left = max(0, int(each_pt) - self.window_size_d)
                right = min(encode_len[i].item(), int(each_pt) + self.window_size_d)
                align = None
                for j in range(left, right):
                    if (j == left):
                        align = encode_h[i][j].view(1, -1)
                    else:
                        align = torch.cat((align, encode_h[i][j].view(1, -1)), dim=0)
                align = nn.functional.softmax(torch.squeeze(torch.bmm(yt[i].view(1, 1, -1), align.t().unsqueeze(dim=0)), dim=0).squeeze(dim=0))
                ex_p = torch.zeros(right-left, dtype=torch.float16)
                for j in range(left, right):
                    ex_p[j-left] = math.exp(-(j-each_pt)*(j-each_pt)/(self.window_size_d*self.window_size_d/2))
                ex_p = ex_p.to(self.device).cuda()
                align = align.to(self.device).cuda()
                at = align * ex_p
                ct = torch.zeros(self.hidden_size, dtype=torch.float16)
                ct = ct.to(self.device).cuda()
                for j in range(left, right):
                    ct += at[j-left]*encode_h[i][j]
                if (i == 0):
                    batch_ct = torch.cat((ct.view(1, -1), yt[i].view(1, -1)), dim=1)
                else:
                    batch_ct = torch.cat((batch_ct, torch.cat((ct.view(1, -1), yt[i].view(1, -1)), dim=1)), dim=0)
        #batch_ct = torch.zeros(pt.shape[0], self.hidden_size * 2, device=self.device)
        ht = nn.functional.tanh(self.ct2ht(batch_ct))
        batch_ct = None
        return ht_ct, ht
        '''
        encode_len = encode_len.cuda()
        yt, ht_ct = self.decoder(pre_yt, pre_ht_ct)
        yt = torch.squeeze(yt, dim=0)  # batch * hidden_size
        batch_size = yt.shape[0]
        pt = torch.sigmoid(self.tan2pt(torch.tanh(self.ht2tan(yt)))).view(yt.shape[0]) * encode_len  # batch
        pt = pt.view(batch_size, 1)  # batch * 1
        # with torch.no_grad():
        # encode_h : batch * sen_len * hidden_size
        pre_align = torch.bmm(yt.view(batch_size, 1, self.hidden_size),
                              torch.transpose(encode_h, 1, 2)).squeeze(dim=1)  # batch * sen_len
        src_mask = (source == self.text.src['<pad>']).long().t()
        src_mask.cuda()
        # shuhe = torch.full((batch_size, encode_h.shape[1]), float("-inf"), dtype=torch.float, device=self.device)
        '''
        shuhe = torch.zeros((batch_size, encode_h.shape[1]), dtype=float)
        for i in range(batch_size):
            shuhe[i][encode_len[i].item():] = float('-inf')
        '''
        '''
        for i in range(batch_size):
            pre_align[i][encode_len[i].item():] = float('-inf')
        '''
        pre_align.data.masked_fill_(src_mask.bool(), float('-inf'))
        # sdz
        # pre_align = pre_align - torch.tensor(shuhe, dtype=torch.float, device=self.device, requires_grad=False).reshape(batch_size, encode_h.shape[1])
        align = nn.functional.softmax(pre_align, dim=-1)  # batch * sen_len
        per_s = torch.arange(0, encode_h.shape[1], dtype=torch.long,
                             device=self.device).view(1, encode_h.shape[1]).expand(batch_size, encode_h.shape[1])
        at = align * torch.exp(-(torch.pow(per_s - pt, 2) / (self.window_size_d * self.window_size_d / 2)))  # batch * sen_len
        at = at.view(batch_size, -1, 1)
        pre_ct = at * encode_h  # batch * sen_len * hidden_size
        ct = torch.cat((pre_ct.sum(dim=1), yt), dim=-1)
        ht = torch.tanh(self.ct2ht(ct))
        return ht_ct, ht

    def beam_search(self, src, search_size, max_tar_length, test_batch_size):
        '''
        src_tensor = self.text.src.word2tensor(src, self.device)
        all_h, encode_len, (h_n, c_n) = self.encode(src_tensor, [len(src)])
        sen_len = all_h.shape[1]
        new_all_h = all_h
        for i in range(search_size-1):
            new_all_h = torch.cat((new_all_h, all_h), dim=0)
        all_h = new_all_h
        all_h = all_h.cuda()
        encode_len = []
        for i in range(search_size):
            encode_len.append(len(src))
        encode_len = torch.tensor(encode_len, dtype=torch.long, device=self.device)
        encode_len = encode_len.cuda()
        h_n = h_n.cuda()
        c_n = c_n.cuda()
        now_h = h_n
        now_c = c_n
        end_id = self.text.tar['<end>']
        now_predict = [[self.text.tar['<start>']]]
        now_predict_words = [self.text.tar['<start>']]
        now_batch_word_tensor = torch.cat((self.embeddings.tar(torch.tensor([self.text.tar['<start>']], dtype=torch.long, device=self.device).cuda()), torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.device).cuda()), dim=-1).reshape(1, 1, -1)
        predict = []
        now_predict_length = 0
        while (len(predict) < search_size and now_predict_length < max_tar_length):
            now_predict_length += 1
            next_ht_ct, next_ht = self.step(all_h[:len(now_predict)].reshape(len(now_predict), sen_len, -1), encode_len[:len(now_predict)], now_batch_word_tensor, (now_h, now_c))
            now_h, now_c = next_ht_ct
            now_h = now_h.permute(1, 0, 2)
            now_c = now_c.permute(1, 0, 2)
            P = nn.functional.softmax(self.ht2final(next_ht), dim=-1)
            padding_score = None
            for i in range(len(now_predict_words)):
                if (i == 0):
                    padding_score = P[i]
                else:
                    padding_score = torch.cat((padding_score, P[i]), dim=-1)
            _, topk_index = torch.topk(padding_score, search_size)
            next_predict_words = []
            next_predict = []
            next_h = None
            next_c = None
            now_final_h = None
            for i in range(search_size):
                next_word_id = topk_index[i].item() % len(self.text.tar)
                batch_id = topk_index[i].item() // len(self.text.tar)
                now_sen = now_predict[batch_id]
                if (next_word_id == end_id):
                    predict.append(now_sen[1:])
                    if (len(predict) == search_size):
                        break
                    continue
                next_predict_words.append(next_word_id)
                now_sen.append(next_word_id)
                next_predict.append(now_sen)
                if (next_h is None):
                    next_h = now_h[batch_id].reshape(1, 4, -1)
                    next_c = now_c[batch_id].reshape(1, 4, -1)
                    now_final_h = next_ht[batch_id].reshape(1, -1)
                else:
                    next_h = torch.cat((next_h, now_h[batch_id].reshape(1, 4, -1)), dim=0)
                    next_c = torch.cat((next_c, now_c[batch_id].reshape(1, 4, -1)), dim=0)
                    now_final_h = torch.cat((now_final_h, next_ht[batch_id].reshape(1, -1)), dim=0)
            if (len(predict) == search_size):
                break
            if (now_predict_length == max_tar_length):
                for sen in next_predict:
                    predict.append(sen[1:])
                    if (len(predict) == search_size):
                        break
            now_predict_words = next_predict_words
            now_predict = next_predict
            now_h = next_h.view(4, next_h.shape[0], -1).contiguous()
            now_c = next_c.view(4, next_c.shape[0], -1).contiguous()
            now_batch_word_tensor = torch.cat((self.embeddings.tar(torch.tensor(now_predict_words, dtype=torch.long, device=self.device)), now_final_h), dim=1)
            now_batch_word_tensor = now_batch_word_tensor.reshape(1, now_batch_word_tensor.shape[0], now_batch_word_tensor.shape[1])
        return predict
        '''
        encode_len = []
        for i in range(test_batch_size):
            encode_len.append(len(src[i]))
        src_tensor = self.text.src.word2tensor(src, self.device)
        now_source = src_tensor
        all_h, encode_len, (h_n, c_n) = self.encode(src_tensor, encode_len)
        sen_len = all_h.shape[1]
        now_all_h = all_h
        encode_len = torch.tensor(encode_len, dtype=torch.long, device=self.device)
        encode_len = encode_len.cuda()
        now_encode_len = encode_len
        now_h = h_n
        now_c = c_n
        predict = [[] for _ in range(test_batch_size)]
        now_predict = [[0] for _ in range(test_batch_size)]
        now_batch_word_tensor = torch.cat(
            (self.embeddings.tar(self.text.tar.word2tensor(now_predict, self.device)).squeeze(dim=0),
             torch.zeros(test_batch_size, self.hidden_size, dtype=torch.float, device=self.device).cuda()),
            dim=-1).reshape(1, test_batch_size, -1)
        now_predict_length = 0
        now_score = torch.zeros(test_batch_size, dtype=torch.float, device=self.device).reshape(test_batch_size, 1)
        batch_index = [(i, 1) for i in range(test_batch_size)]
        while (now_predict_length < max_tar_length):
            now_predict_length += 1
            next_ht_ct, next_ht = self.step(now_source, now_all_h, now_encode_len,
                                            now_batch_word_tensor.contiguous(), (now_h, now_c))
            P = (nn.functional.softmax(self.ht2final(next_ht), dim=-1) + now_score).reshape(next_ht.shape[0] * len(self.text.tar))
            next_batch_index = []
            now_start = 0
            next_predict = []
            next_score = []
            next_words = []
            next_all_h = None
            next_encode_len = []
            next_h = None
            next_c = None
            now_ht = None
            next_source = None
            now_source = src_tensor.t()
            now_h, now_c = next_ht_ct
            now_h = now_h.permute(1, 0, 2)
            now_c = now_c.permute(1, 0, 2)
            flag = False
            for key, value in batch_index:
                score, topk_index = torch.topk(P[len(self.text.tar) * now_start:len(self.text.tar) * (value + now_start)], search_size)
                next_value = 0
                now_flag = False
                for i in range(search_size):
                    next_word_id = topk_index[i].item() % len(self.text.tar)
                    sent_id = topk_index[i].item() // len(self.text.tar)
                    if (next_word_id == self.text.tar['<end>']):
                        if (len(now_predict[now_start + sent_id][1:]) == 0):
                            continue
                        predict[key].append(((score[i].item() - now_score[now_start][0].item()) / math.pow(len(now_predict[now_start + sent_id][1:]), config.alpha),
                                             now_predict[now_start + sent_id][1:].copy()))
                        if (len(predict[key]) == search_size):
                            now_flag = True
                            break
                        continue
                now_start += value
                if (now_flag):
                    continue
                for i in range(search_size):
                    next_word_id = topk_index[i].item() % len(self.text.tar)
                    sent_id = topk_index[i].item() // len(self.text.tar)
                    if (next_word_id == self.text.tar['<end>']):
                        continue
                    if (now_predict_length == max_tar_length):
                        predict[key].append((score[i].item() / math.pow(len(now_predict[now_start - value + sent_id][1:]) + 1, config.alpha),
                                             now_predict[now_start - value + sent_id][1:].copy()))
                        predict[key][-1][1].append(next_word_id)
                        if (len(predict[key]) == search_size):
                            now_flag = True
                            break
                        continue
                    next_value += 1
                    next_predict.append(now_predict[now_start - value + sent_id].copy())
                    next_predict[-1].append(next_word_id)
                    next_score.append(score[i].item())
                    next_words.append([next_word_id])
                    if (next_all_h is None):
                        next_all_h = all_h[key].reshape(1, -1, self.hidden_size)
                        next_encode_len.append(encode_len[key].item())
                        next_h = now_h[now_start - value + sent_id].reshape(1, 4, -1)
                        next_c = now_c[now_start - value + sent_id].reshape(1, 4, -1)
                        now_ht = next_ht[now_start - value + sent_id].reshape(1, -1)
                        next_source = now_source[key].reshape(1, -1)
                    else:
                        next_all_h = torch.cat((next_all_h, all_h[key].reshape(1, sen_len, self.hidden_size)), dim=0)
                        next_encode_len.append(encode_len[key].item())
                        next_h = torch.cat((next_h, now_h[now_start - value + sent_id].reshape(1, 4, -1)), dim=0)
                        next_c = torch.cat((next_c, now_c[now_start - value + sent_id].reshape(1, 4, -1)), dim=0)
                        now_ht = torch.cat((now_ht, next_ht[now_start - value + sent_id].reshape(1, -1)), dim=0)
                        next_source = torch.cat((next_source, now_source[key].reshape(1, -1)), dim=0)
                if (now_flag):
                    continue
                flag = True
                next_batch_index.append((key, next_value))
            if (not flag):
                break
            now_source = next_source.t()
            now_score = torch.tensor(next_score, dtype=torch.float, device=self.device).reshape(-1, 1)
            now_all_h = next_all_h
            now_encode_len = torch.tensor(next_encode_len, dtype=torch.long, device=self.device)
            now_h = next_h.permute(1, 0, 2).contiguous()
            now_c = next_c.permute(1, 0, 2).contiguous()
            now_predict = next_predict
            batch_index = next_batch_index
            now_batch_word_tensor = torch.cat(
                (self.embeddings.tar(self.text.tar.word2tensor(next_words, self.device)).squeeze(dim=0), now_ht),
                dim=-1).reshape(1, len(next_encode_len), -1)
        output = []
        for sub in predict:
            sub = sorted(sub, key=lambda sc: sc[0], reverse=True)
            output.append(sub[0][1])
        return output

    @staticmethod
    def load(model_path):
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        model = NMT(params['text'], params['options'], params['device'])
        model.load_state_dict(params['state_dict'])
        return model

    def save(self, model_path):
        print(f"save model to path [{model_path}]")
        params = {
            'text': self.text,
            'options': self.options,
            'device': self.device,
            'state_dict': self.state_dict()
        }
        torch.save(params, model_path)
def build_gru(name, idxs):
    embeddings = Embeddings(vocab_size, embedding_dim, idxs=idxs)
    return GruRnn(name, embedding_dim, hidden_dim, opts, update_fn, h0,
                  embeddings.embeddings())
class Embedding_retrieval:
    def __init__(self):
        self.embeddings = Embeddings(path='Data/wordvectors.kv')
        with open('Data/ranking_dict/document_frequencies_text.p', 'rb') as fp:
            self.document_frequencies = pickle.load(fp)
        with open('Data/ranking_dict/term_frequencies_text.p', 'rb') as fp:
            self.term_frequencies = pickle.load(fp)
        with open('Data/ranking_dict/document_length_text.p', 'rb') as fp:
            self.document_length = pickle.load(fp)
        self.num_documents = len(self.term_frequencies)
        self.avg_length = mean(self.document_length.values())

    def get_closest_sentence(self, query, id, doc, topk=3):
        k = 1.5
        b = 0.75
        weights = []
        for term in re.findall(r"[\w']+|[.,!?;]", query):
            term = term.lower()
            if not term in self.document_frequencies:
                continue
            df = self.document_frequencies[term]
            idf = np.log((self.num_documents - df + 0.5) / (df + 0.5))
            document_dict = self.term_frequencies[id]
            if not term in document_dict:
                weights.append(0)
                continue
            tf = document_dict[term]
            wd = ((tf * (k + 1)) /
                  (tf + k * (1 - b + b * self.document_length[id] / self.avg_length))) + 1
            weights.append(idf * wd)

        query_embedding = self.weighted_embedding(query, weights)

        doc_embedding = []
        tokenized_sent = tokenize.sent_tokenize(doc)
        for sent in tokenized_sent:
            try:
                doc_embedding.append(self.sent_embedding(sent))
            except:
                print(sent)
                raise Exception('Could not embed sentence')

        scores = []
        query_norm = np.linalg.norm(query_embedding)
        for i, emb in enumerate(doc_embedding):
            sent_norm = np.linalg.norm(emb)
            if sent_norm == 0:
                scores.append((i, 0))
            else:
                scores.append(
                    (i, np.dot(emb, query_embedding) / (sent_norm * query_norm)))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)

        most_similar = []
        for index, _ in scores[:topk]:
            most_similar.append(tokenized_sent[index])
        return most_similar

    def weighted_embedding(self, query, weights):
        sum_weights = sum(weights)
        # weights = [w/sum_weights for w in weights]
        embeddings = []
        for term in re.findall(r"[\w']+|[.,!?;]", query):
            term = term.lower()
            embeddings.append(self.embeddings.wv[term])
        ones = np.ones(len(embeddings)) / len(embeddings)
        return np.dot(ones, embeddings)

    def sent_embedding(self, sentence):
        embeddings = None
        count = 0
        for term in re.findall(r"[\w']+|[.,!?;]", sentence):
            term = term.lower()
            if term in self.embeddings.wv.vocab:
                if embeddings is None:
                    embeddings = self.embeddings.get_embedding(term)
                else:
                    embeddings = np.add(embeddings, self.embeddings.get_embedding(term))
                count += 1
            else:
                pass  # print(term)
        if embeddings is None:
            # print('Embeddings none for sentence: {}'.format(sentence))
            return np.zeros(100)
        return embeddings / count
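# Runnable toy computation of the BM25-style term weight used in get_closest_sentence above
# (k=1.5, b=0.75 as in the code). The corpus statistics below are made-up illustration values,
# not taken from the original data.
import numpy as np

k, b = 1.5, 0.75
num_documents, df = 1000, 50          # corpus size and document frequency of the term
tf, doc_len, avg_len = 3, 120, 100    # term frequency and length of the current document

idf = np.log((num_documents - df + 0.5) / (df + 0.5))
wd = ((tf * (k + 1)) / (tf + k * (1 - b + b * doc_len / avg_len))) + 1
print(idf * wd)                        # per-term weight fed into the query embedding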
def main():
    sem_eval_data_dir = './data/semeval-2010-task-8'
    sem_eval_indices = [0, 1, 3, 5, 6, 7]

    train_words, train_starts, train_pos, train_link, train_dep, train_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT.all'), sem_eval_indices)
    train_starts = str_to_int(train_starts)
    train_link = str_to_int(train_link)
    train_rel_labels, train_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT'))
    train_branch1, train_branch2 = build_branches_indices(
        train_pair_positions, train_starts, train_link)

    test_words, test_starts, test_pos, test_link, test_dep, test_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT.all'), sem_eval_indices)
    test_starts = str_to_int(test_starts)
    test_link = str_to_int(test_link)
    test_rel_labels, test_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT'))
    test_branch1, test_branch2 = build_branches_indices(
        test_pair_positions, test_starts, test_link)

    rel_classes = sorted(set(train_rel_labels + test_rel_labels))
    rel_to_index = {l: i for i, l in enumerate(rel_classes)}
    index_to_relation = {i: l for i, l in enumerate(rel_classes)}
    pos_classes = sorted(
        {l for sent_pos in train_pos + test_pos for l in sent_pos})
    pos_to_index = build_labels_mapping(pos_classes)
    label_classes = sorted({
        l for sent_labels in train_ent_labels + test_ent_labels for l in sent_labels
    })
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)
    dep_classes = sorted(
        {l for sent_dep in train_dep + test_dep for l in sent_dep})
    dep_to_index = build_labels_mapping(dep_classes)

    word_set = {w for sent in train_words + test_words for w in sent}
    print(f'{len(word_set)} unique words found.')
    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt', True, word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_rel_ext_inputs(train_words, embed, train_pos, pos_to_index,
                                       train_ent_labels, label_to_index, train_dep,
                                       dep_to_index, train_branch1, train_branch2)
    train_outputs = [[rel_to_index[l]] for l in train_rel_labels]
    test_inputs = make_rel_ext_inputs(test_words, embed, test_pos, pos_to_index,
                                      test_ent_labels, label_to_index, test_dep,
                                      dep_to_index, test_branch1, test_branch2)

    model = build_rel_ext_model(len(rel_classes), embed_matrix, len(label_classes),
                                len(dep_classes), len(pos_classes))

    train_generator = DataGenerator(train_inputs, (train_outputs, []), 32)
    evaluator = ModelEval(DataGenerator(test_inputs), test_rel_labels, index_to_relation)
    model_saver = ModelCheckpoint(filepath='./checkpoints/' + model.name.replace(' ', '_')
                                  + '_{epoch:02d}.hdf5',
                                  verbose=1, save_best_only=True,
                                  monitor='valid_f1', mode='max')
    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/RE_log_{time_stamp}.csv", append=False)
    # model.load_weights('./checkpoints/relation_classifier_20.hdf5')
    model.fit_generator(train_generator, epochs=20,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))