def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    # The multi-head attention dimension is set equal to the embedding
    # dimension here. (Does it have to be?)
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important in their code:
    # initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
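# A minimal usage sketch, not part of the original snippet: it assumes the
# Annotated-Transformer-style classes referenced above (EncoderDecoder,
# MultiHeadedAttention, PositionalEncoding, ...) and `torch` are in scope.
# It builds a small 2-layer model over a toy 11-symbol vocabulary and runs
# the encoder once.
tmp_model = make_model(11, 11, N=2)
src = torch.LongTensor([[1, 2, 3, 4, 5]])
src_mask = torch.ones(1, 1, 5)
memory = tmp_model.encode(src, src_mask)  # -> (1, 5, 512) encoder states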
def set_up():
    if request.is_json:
        content = request.get_json()
        if content['key'] == 'fox':
            Loader().download_all_models()
        if content['key'] == 'snake':
            embedding_model = Embeddings()
        if content['key'] == 'sitara':
            Loader().download_all_models()
            embedding_model = Embeddings()
    return 'All data is downloaded'
def build_embeddings(self, embeddings_vec_path, *raw_datas, oov_as_unk=True, lower=True):
    """ Build Embeddings object that includes vectors of the words in the data.

    Args:
        embeddings_vec_path (str): Path to the pretrained word vector file,
            e.g. FastText.
        raw_datas (list of dict): List of raw data **TOKENIZED** with
            tokenize_data, loaded from a json file.
        oov_as_unk (bool): Whether to map words missing from the pretrained
            word vectors to `<unk>`. Otherwise, OOV embeddings are randomly
            initialized.
        lower (bool): Whether to lowercase words before lookup.
    """
    words = {}
    for raw_data in raw_datas:
        words = self._collect_words(raw_data, words)

    self.embeddings = Embeddings(embeddings_vec_path, words, oov_as_unk, lower=lower)
    self.embeddings.add('<pad>', torch.tensor([0.] * self.embeddings.get_dim()))
    self.embeddings.add('<teacher>')
    self.embeddings.add('<student>')
    self.embeddings.add('CANNOTANSWER')
def choose_mnist(self):
    print("CHOSE MNIST")
    global predictor, autoencode_model, embeddings
    predictor = autoencode_predict.predict(
        name="meta-data/mnist/autoencode_model", color_depth=1)
    predictor.stop()
    predictor.restore()
    autoencode_model = predictor.autoencode_model
    embeddings = Embeddings(predictor)
    print("Loading images ...")
    if 'mnist' not in self.data_sets:
        print("Key missing. Building ImageData")
        imageData = LazyLoadWrapper(
            BatchWrapper(
                ResizeWrapper(ReshapeWrapper(Mnist(), [28, 28, 1]), [32, 32])))
        imageData.getImages()
        self.data_sets['mnist'] = imageData
    print(" mnist shape is", self.data_sets['mnist'].getImages().shape)
    print("... loading images done")
    embeddings.data_set = self.data_sets['mnist'].getImages()
    return self.data_sets['mnist']
def __init__(self, text, args, device):
    super(NMT, self).__init__()
    self.text = text
    self.args = args
    self.device = device
    self.Embeddings = Embeddings(args['embed_size'], self.text)
    self.encoder_layer = nn.TransformerEncoderLayer(
        d_model=args['d_model'], nhead=args['nhead'],
        dim_feedforward=args['dim_feedforward'], dropout=args['dropout'])
    self.encoder_norm = nn.LayerNorm(args['d_model'])
    self.encoder = nn.TransformerEncoder(
        encoder_layer=self.encoder_layer,
        num_layers=args['num_encoder_layers'],
        norm=self.encoder_norm)
    self.decoder_layer = nn.TransformerDecoderLayer(
        d_model=args['d_model'], nhead=args['nhead'],
        dim_feedforward=args['dim_feedforward'], dropout=args['dropout'])
    self.decoder_norm = nn.LayerNorm(args['d_model'])
    self.decoder = nn.TransformerDecoder(
        decoder_layer=self.decoder_layer,
        num_layers=args['num_decoder_layers'],
        norm=self.decoder_norm)
    self.project = nn.Linear(args['d_model'], len(self.text.tar), bias=False)
    self.project.weight = self.Embeddings.tar.weight  # tie output projection to target embeddings
    self.dropout = nn.Dropout(args['dropout'])
    self.project_value = math.pow(args['d_model'], 0.5)
    self.eps = args['smoothing_eps']
def test_word2vec_set():
    embed = Embeddings('./data/word2vec.txt', True, word_set={'a', 'b', 'c'})
    matrix = embed.matrix
    assert matrix.shape == (5, 3)
    assert len(embed.vocab) == 3
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
def __init__(self, iT, corefs, model):
    self.iT = iT
    self.corefs = corefs
    self.embeddings = Embeddings(model)
    dist, components = self.computeProgression()
    self.distances = dist
    self.components = components
def __init__(self, X_train: list, Y_train: list, embed_path: str, embed_dim: int,
             stop_words=[], X_test=[], Y_test=[], max_len=None, epochs=3,
             batch_size=256):
    # Preprocessing the text
    X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
    Y_train = np.asarray(Y_train)

    # Tokenizing the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # Saving the tokenizer
    self.tokenizer = tokenizer

    # Creating the embedding matrix
    embedding = Embeddings(embed_path, embed_dim)
    embedding_matrix = embedding.create_embedding_matrix(
        tokenizer, len(tokenizer.word_counts))

    # Creating the padded input for the deep learning model
    if max_len is None:
        max_len = np.max([len(text.split()) for text in X_train])
    TextToTensor_instance = TextToTensor(tokenizer=tokenizer, max_len=max_len)
    X_train = TextToTensor_instance.string_to_tensor(X_train)

    # Creating the model
    rnn = RnnModel(embedding_matrix=embedding_matrix, embedding_dim=embed_dim,
                   max_len=max_len)
    rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)
    self.model = rnn.model

    # If X_test is provided we make predictions with the created model
    if len(X_test) > 0:
        X_test = [clean_text(text) for text in X_test]
        X_test = TextToTensor_instance.string_to_tensor(X_test)
        yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]
        self.yhat = yhat

        # If true labels are provided we calculate the accuracy of the model
        if len(Y_test) > 0:
            self.acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
            self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
def embedding():
    if request.is_json:
        content = request.get_json()
        serializer = EmbeddingSerializer(data=content)
        if not serializer.is_valid():
            return 'Error'
        text = serializer.text
        token = serializer.token
        vector = Embeddings().build_sentence_vector(text).tolist()
        data = json.dumps({"vector": vector, "token": token})
        return data
def __init__(self, data_name, num_class=5):
    self.data_name = data_name
    self.train_data_path = '../data/' + self.data_name + '/train.txt'
    self.test_data_path = '../data/' + self.data_name + '/test.txt'
    self.dev_data_path = '../data/' + self.data_name + '/dev.txt'
    self.embeddings = Embeddings(data_name)
    self.num_class = num_class
    start_time = time.time()
    self.load_data()
    print('Reading datasets consumes %.3f seconds' % (time.time() - start_time))
def get_pretrained_embeddings(path, vocab, method='word2vec'):
    emb = Embeddings()
    model = emb.load_model(method=method, model_path=path)
    embed_size = model.vector_size
    embeddings = np.zeros((len(vocab), embed_size))
    oov_count = 0
    for word in vocab:
        word_index = vocab[word]
        if word in model.vocab:
            embeddings[word_index] = model[word]
        else:
            oov_count += 1
    print('OOV count: %i' % oov_count)
    return embeddings.astype('float32')
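# Hypothetical call of the helper above; the path and the toy vocab are
# placeholders, not from the original source.
vocab = {'<pad>': 0, 'the': 1, 'cat': 2}
emb_matrix = get_pretrained_embeddings('./models/word2vec.bin', vocab,
                                       method='word2vec')
print(emb_matrix.shape)  # (len(vocab), model.vector_size), dtype float32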
def __init__(self):
    self.embeddings = Embeddings(path='Data/wordvectors.kv')
    with open('Data/ranking_dict/document_frequencies_text.p', 'rb') as fp:
        self.document_frequencies = pickle.load(fp)
    with open('Data/ranking_dict/term_frequencies_text.p', 'rb') as fp:
        self.term_frequencies = pickle.load(fp)
    with open('Data/ranking_dict/document_length_text.p', 'rb') as fp:
        self.document_length = pickle.load(fp)
    self.num_documents = len(self.term_frequencies)
    self.avg_length = mean(self.document_length.values())
def __init__(self, text, options, device):
    super(NMT, self).__init__()
    self.options = options
    self.embeddings = Embeddings(options.embed_size, text)
    self.hidden_size = options.hidden_size
    self.window_size_d = options.window_size_d
    self.text = text
    self.device = device
    self.encoder_layer = options.encoder_layer
    self.decoder_layers = options.decoder_layers
    self.encoder = nn.LSTM(input_size=options.embed_size,
                           hidden_size=options.hidden_size,
                           num_layers=options.encoder_layer,
                           bias=True,
                           dropout=options.dropout_rate,
                           bidirectional=False)
    self.decoder = nn.LSTM(input_size=options.embed_size + options.hidden_size,
                           hidden_size=options.hidden_size,
                           num_layers=options.decoder_layers,
                           bias=True,
                           dropout=options.dropout_rate,
                           bidirectional=False)
    self.ht2tan = nn.Linear(in_features=self.hidden_size,
                            out_features=self.hidden_size, bias=False)
    self.tan2pt = nn.Linear(in_features=self.hidden_size,
                            out_features=1, bias=False)
    self.ct2ht = nn.Linear(in_features=self.hidden_size * 2,
                           out_features=self.hidden_size, bias=False)
    self.ht2final = nn.Linear(in_features=self.hidden_size,
                              out_features=len(self.text.tar), bias=False)
def embedKG(self):
    self.logger.info("Embedding NP and relation phrases")

    fname1 = self.p.out_path + self.p.file_entEmbed
    fname2 = self.p.out_path + self.p.file_relEmbed

    if not checkFile(fname1) or not checkFile(fname2):
        embed = Embeddings(self.p, self.side_info, self.logger)
        embed.fit()

        self.ent2embed = embed.ent2embed  # Get the learned NP embeddings
        self.rel2embed = embed.rel2embed  # Get the learned RP embeddings

        pickle.dump(self.ent2embed, open(fname1, 'wb'))
        pickle.dump(self.rel2embed, open(fname2, 'wb'))
    else:
        self.logger.info('\tLoading cached Embeddings')
        self.ent2embed = pickle.load(open(fname1, 'rb'))
        self.rel2embed = pickle.load(open(fname2, 'rb'))
def load_embeddings_vocab(self):
    pretrained_embeddings = Embeddings()
    # read filtered embeddings
    if not tf.gfile.Exists(config.filtered_embeddings_path):
        word_to_vec = pretrained_embeddings.load_universal_embeddings()
        self.create_vocabulary(self.vocab_file,
                               pretrained_embeddings.all_words(word_to_vec),
                               tokenizer=None)
        word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)
        filtered_embeddings = pretrained_embeddings.filter_vocab_embeddings(
            word_to_vec, word_to_idx.keys())
        with open(config.filtered_embeddings_path, 'wb') as output_file:
            pickle.dump(filtered_embeddings, output_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
    else:
        word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)

    word_prob = self.read_unigram_freq(self.unigram_prob_file)
    assert 1.01 > sum([0 if val is None else val for val in word_prob.values()]) > 0.99, \
        "Unigram probabilities should sum to ~1"

    pre_embs_dict, embd_dim = pretrained_embeddings.load_filtered_pretrained_embeddings(
        config.filtered_embeddings_path)
    word_vec = pretrained_embeddings.get_embedding_matrix(
        pre_embs_dict, word_to_idx, embd_dim)

    self.word_vec = word_vec
    self.word_prob = word_prob
    self.word_to_idx = word_to_idx
    self.idx_to_word = idx_to_word

    train_path = os.path.join(config.data_dir, config.data_files['train'])
    dev_path = os.path.join(config.data_dir, config.data_files['dev'])
    self.write_data_to_token_ids(train_path, target_path=train_path)
    self.write_data_to_token_ids(dev_path, target_path=dev_path)
def choose_garden(self):
    print("CHOSE GARDEN")
    global predictor, autoencode_model, embeddings
    predictor = autoencode_predict.predict(
        name="meta-data/garden/garden_model", color_depth=3)
    predictor.stop()
    predictor.restore()
    autoencode_model = predictor.autoencode_model
    embeddings = Embeddings(predictor)
    config_data = json.load(open("data/file_data.json", "r"))
    print("Loading images ...")
    if 'garden' not in self.data_sets:
        print("Key missing. Building ImageData")
        print("Loading files ...", end=" ")
        files = LazyLoadWrapper(
            ResizeWrapper(
                FileReader(config_data["file_names"], config_data["labels"]),
                [64, 64]))
        files.init()
        print("done.")
        print("Calculating full size ...", end=" ")
        full_size = LazyLoadWrapper(ResizeWrapper(files, [32, 32]))
        full_size.init()
        print("done.")
        print("Calculating half size ...", end=" ")
        half_size = LazyLoadWrapper(SliceWrapper(files, 32, 16))
        half_size.init()
        print("done.")
        print("Calculating concat the whole thing ...", end=" ")
        self.data_sets['garden'] = LazyLoadWrapper(
            BatchWrapper(ConcatWrapper([full_size, half_size])))
        print("done.")
    self.data_sets['garden'].getImages()
    print(" garden shape is", self.data_sets['garden'].getImages().shape)
    print("... loading images done")
    embeddings.data_set = self.data_sets['garden'].getImages()
    return self.data_sets['garden']
def embeddings(args):
    kf = KFold(n_splits=args.splits_num, shuffle=args.shuffle, random_state=42)
    score_lst = list()
    for fold, (train_index, valid_index) in enumerate(kf.split(users)):
        train_users = users[train_index]
        train_movies = movies[train_index]
        train_ratings = ratings[train_index]
        valid_users = users[valid_index]
        valid_movies = movies[valid_index]
        valid_ratings = ratings[valid_index]

        model = Embeddings(
            number_of_users, number_of_movies,
            embeddings_size=args.embeddings_size,
            dropout_embeddings=args.embeddings_dropout_embeddings,
            dropout=args.embeddings_dropout)
        model.fit(train_users, train_movies, train_ratings,
                  valid_users=valid_users,
                  valid_movies=valid_movies,
                  valid_ratings=valid_ratings,
                  epochs=args.embeddings_num_epochs,
                  verbose=args.verbose,
                  decay=args.embeddings_decay,
                  decay_steps=args.embeddings_decay_steps,
                  learning_rate=args.embeddings_learning_rate,
                  batch_size=args.embeddings_batch_size)

        preds = model.predict(valid_users, valid_movies)
        score = root_mean_square_error(valid_ratings, preds)
        score_lst.append(score)
        print("Fold:", fold + 1, "score:", score)

    print('Mean CV RMSE:', np.mean(score_lst))
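# Hypothetical invocation of embeddings() above. The attribute names mirror
# exactly what the function reads from `args`; the values are illustrative,
# and the global users/movies/ratings arrays are assumed to be loaded already.
import argparse

args = argparse.Namespace(
    splits_num=5, shuffle=True, verbose=1,
    embeddings_size=64, embeddings_dropout_embeddings=0.2, embeddings_dropout=0.1,
    embeddings_num_epochs=10, embeddings_decay=0.97, embeddings_decay_steps=1000,
    embeddings_learning_rate=1e-3, embeddings_batch_size=1024)
embeddings(args)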
def prepare_word_embeddings(query_lang_emb, qlang_long, doc_lang_emb, dlang_long,
                            limit_emb, normalize=False, processes=40):
    """
    Creates a Word Embedding helper object.

    :param query_lang_emb: language of the queries
    :param qlang_long: long-form name of the query language
    :param doc_lang_emb: language of the documents
    :param dlang_long: long-form name of the document language
    :param limit_emb: load only the first n embeddings
    :param normalize: transform vectors to unit length
    :param processes: number of parallel workers
    :return: an Embeddings object with both languages loaded
    """
    embeddings = Embeddings()
    embeddings.load_embeddings(query_lang_emb, processes=processes, language=qlang_long,
                               limit=limit_emb, normalize=normalize)
    embeddings.load_embeddings(doc_lang_emb, processes=processes, language=dlang_long,
                               limit=limit_emb, normalize=normalize)
    return embeddings
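# Hypothetical call (the language codes and the embedding limit are
# illustrative, not from the original source):
embeddings = prepare_word_embeddings(
    query_lang_emb="en", qlang_long="english",
    doc_lang_emb="de", dlang_long="german",
    limit_emb=100000, normalize=True, processes=8)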
def __init__(self, text, model='Word2Vec'):
    self.model = Embeddings(model)
    self.model.fit_corpus(text)
    self.model.train()
import os

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import random

embeddings = Embeddings(100, 4, 1, 4)

# getting data from preprocessing
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

# generating training data
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings

word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = 128
epochs = 10
window_size = 5
accuracy_threshold = 0.85
activation = 'relu'
custom_accuracy = 0
loss_function = 'mse'
model_name = 'POS_GRU ' + loss_function + "_" + str(custom_accuracy) + "_" + \
    activation + "_" + str(window_size) + "_" + str(batch_size)

embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()
pos2index, index2pos = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)

seq_in = []
seq_out = []

# generating dataset
for sentence in tokenized_pos_sentences:
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(y)
def build_embedding(idxs=None, sequence_embeddings=None):
    return Embeddings(vocab.size(), opts.embedding_dim,
                      idxs=idxs, sequence_embeddings=sequence_embeddings)
# model_name = 'places365vgg'
# layer_name = 'prob'
# model_path = './mynet.npy'

model_name = 'places365resnetft'
layer_name = 'bn5a_branch2c'
model_path = './resnet365ft.npy'

# model_name = 'places365resnet'
# layer_name = 'prob'
# model_path = './resnet365.npy'

path1 = "/run/user/1000/gvfs/sftp:host=10.2.36.75,user=anjan/tmp/anjan/2014-05-06-12-54-54/stereo/centre_corrected/"
path2 = "/run/user/1000/gvfs/sftp:host=10.2.36.75,user=anjan/tmp/anjan/2014-05-06-12-54-54/mono_rear_corrected/"

clf = Embeddings(model_name, layer_name, model_path)

imagenames1 = [f for f in listdir(path1)]
imagenames2 = [f for f in listdir(path2)]
imagenames1.sort()
imagenames2.sort()
images1 = [join(path1, i) for i in imagenames1]
images2 = [join(path2, i) for i in imagenames2]

temp1 = [[imagenames1[i], imagenames2[i]] for i in range(25)]
temp = [[
    int(imagenames1[i][:-4]) - int(imagenames1[i - 1][:-4]),
    int(imagenames2[i][:-4]) - int(imagenames2[i - 1][:-4])
] for i in range(1, 50)]
def test_word2vec_size():
    embed = Embeddings('./data/word2vec.txt', True, 4)
    matrix = embed.matrix
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
def main():
    sem_eval_data_dir = './data/semeval-2010-task-8'
    sem_eval_indices = [0, 1, 3, 5, 6, 7]

    train_words, train_starts, train_pos, train_link, train_dep, train_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT.all'),
                   sem_eval_indices)
    train_starts = str_to_int(train_starts)
    train_link = str_to_int(train_link)
    train_rel_labels, train_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT'))
    train_branch1, train_branch2 = build_branches_indices(
        train_pair_positions, train_starts, train_link)

    test_words, test_starts, test_pos, test_link, test_dep, test_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT.all'),
                   sem_eval_indices)
    test_starts = str_to_int(test_starts)
    test_link = str_to_int(test_link)
    test_rel_labels, test_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT'))
    test_branch1, test_branch2 = build_branches_indices(
        test_pair_positions, test_starts, test_link)

    rel_classes = sorted(set(train_rel_labels + test_rel_labels))
    rel_to_index = {l: i for i, l in enumerate(rel_classes)}
    index_to_relation = {i: l for i, l in enumerate(rel_classes)}

    pos_classes = sorted({l for sent_pos in train_pos + test_pos for l in sent_pos})
    pos_to_index = build_labels_mapping(pos_classes)

    label_classes = sorted(
        {l for sent_labels in train_ent_labels + test_ent_labels for l in sent_labels})
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)

    dep_classes = sorted({l for sent_dep in train_dep + test_dep for l in sent_dep})
    dep_to_index = build_labels_mapping(dep_classes)

    word_set = {w for sent in train_words + test_words for w in sent}
    print(f'{len(word_set)} unique words found.')
    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt', True, word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_rel_ext_inputs(train_words, embed, train_pos, pos_to_index,
                                       train_ent_labels, label_to_index,
                                       train_dep, dep_to_index,
                                       train_branch1, train_branch2)
    train_outputs = [[rel_to_index[l]] for l in train_rel_labels]
    test_inputs = make_rel_ext_inputs(test_words, embed, test_pos, pos_to_index,
                                      test_ent_labels, label_to_index,
                                      test_dep, dep_to_index,
                                      test_branch1, test_branch2)

    model = build_rel_ext_model(len(rel_classes), embed_matrix, len(label_classes),
                                len(dep_classes), len(pos_classes))

    train_generator = DataGenerator(train_inputs, (train_outputs, []), 32)
    evaluator = ModelEval(DataGenerator(test_inputs), test_rel_labels,
                          index_to_relation)
    model_saver = ModelCheckpoint(
        filepath='./checkpoints/' + model.name.replace(' ', '_') + '_{epoch:02d}.hdf5',
        verbose=1, save_best_only=True, monitor='valid_f1', mode='max')
    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/RE_log_{time_stamp}.csv", append=False)

    # model.load_weights('./checkpoints/relation_classifier_20.hdf5')
    model.fit_generator(train_generator, epochs=20,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))
import numpy as np

import label_predict
from embeddings import Embeddings
import nearest_neighbour
from data_source import LazyLoadWrapper, ResizeWrapper, ReshapeWrapper, Mnist

embeddings = Embeddings()
imageData = LazyLoadWrapper(
    ResizeWrapper(ReshapeWrapper(Mnist(False), [28, 28, 1]), [32, 32]))
embeddings.data_set = imageData.getImages()

number = 4
print("The number is", imageData.getLabels()[number])

nearest = nearest_neighbour.byIndex(number, embeddings.getEmbeddings(), size=200)
result = zip(imageData.getLabels()[nearest], nearest)

nearest = []
negative_examples = []
for label, data_index in result:
    label = np.argmax(label)
    if label == 4:
        nearest.append(data_index)
    else:
        negative_examples.append(data_index)

print("Pretend labeling the first", len(nearest), " ...")
words = []


class MyCorpusTest(MyCorpus):
    def getMapX(self, line, index):
        if index == self.startToken:
            words.append(line[1])
        return super(MyCorpusTest, self).getMapX(line, index)

    def getMapY(self, line, index):
        return []


emb = Embeddings(
    fname="",  # "/data/wordembeddings/cc.en.300.vec",
    ws="http://127.0.0.1:8023/wordvectors_get?w1={}",
    unknownStore="unknown.300.vec",
    embSize=300)

print("Loading saved model")
json_file = open('model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
print("Loading weights")
model.load_weights("model/model.h5")
print("Loaded model from disk")

for corpus in ["corp", "equi", "wind"]:
    print("Processing corpus %s" % (corpus))
# sanity check for decoder
batch_size = 64
hidden_dim = 512
input_dim = 100
output_dim = 100
max_length = 100
num_heads = 8
inner_dim = 1024
dropout = 0.1
num_layers = 12
pad_id = 0
seq_len = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_layer = Embeddings(input_dim, hidden_dim, max_length, device, static=False)
decoder_layer = DecoderLayer(hidden_dim, num_heads, inner_dim, dropout)
decoder = Decoder(output_dim, hidden_dim, embedding_layer, decoder_layer,
                  num_layers, dropout)

source = torch.LongTensor(batch_size, seq_len).random_(input_dim)
source_mask = (source != pad_id).unsqueeze(1).unsqueeze(2)
target = torch.LongTensor(batch_size, seq_len).random_(input_dim)
target_mask = (target != pad_id).unsqueeze(1).unsqueeze(2)
encoded_source = torch.rand(batch_size, seq_len, hidden_dim)

output = decoder(target, target_mask, encoded_source, source_mask)
def test_fast_text_fixed_size():
    embed = Embeddings('./data/fast_text.txt', True, 4, is_fast_text=True)
    matrix = embed.matrix
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
def test_word2vec_case_sensitive():
    embed = Embeddings('./data/w2v_case_sensitive.txt', False, 4)
    matrix = embed.matrix
    assert (matrix[embed['A']] == np.ones((1, ))).all()
    assert (matrix[embed['a']] == np.ones((1, )) * 2).all()