Example no. 1
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    # The multi-head attention runs at the same dimension as the embeddings
    # (d_model); keeping them equal is a design choice of this implementation.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # Initialize parameters with Glorot / fan_avg, as in the original
    # implementation; this matters for stable training.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
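A minimal usage sketch for this builder, assuming the Annotated-Transformer-style classes above are importable; the vocabulary sizes and N=2 are illustrative values, not taken from the original:

tiny_model = make_model(src_vocab=11, tgt_vocab=11, N=2)
print(sum(p.numel() for p in tiny_model.parameters()))  # rough parameter-count check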
Example no. 2
def set_up():
    if request.is_json:
        content = request.get_json()
        if content['key'] == 'fox':
            Loader().download_all_models()
        if content['key'] == 'snake':
            embedding_model = Embeddings()
        if content['key'] == 'sitara':
            Loader().download_all_models()
            embedding_model = Embeddings()
        return 'All data is downloaded'
    return 'Request body must be JSON', 400
Example no. 3
    def build_embeddings(self,
                         embeddings_vec_path,
                         *raw_datas,
                         oov_as_unk=True,
                         lower=True):
        """ Build Embeddings object that includes vector of words in data.

        Args:
            embeddings_vec_path (str): Path to the pretrained word vector file.
                Ex. FastText.
            raw_datas (list of dict): List of raw data **TOKENIZED** with
                tokenize_data load from json file.
            oov_as_unk (bool): Whether or not treat words not in pretrained
                word vectors set as OOVs. Otherwise, OOVs' embeddings will be
                randomly initialized.
        """
        words = {}
        for raw_data in raw_datas:
            words = self._collect_words(raw_data, words)

        self.embeddings = Embeddings(embeddings_vec_path,
                                     words,
                                     oov_as_unk,
                                     lower=lower)
        self.embeddings.add('<pad>',
                            torch.tensor([0.] * self.embeddings.get_dim()))
        self.embeddings.add('<teacher>')
        self.embeddings.add('<student>')
        self.embeddings.add('CANNOTANSWER')
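A hedged usage sketch for this method; `preprocessor` (the owning object), the vector path, and the data variables are placeholders rather than names from the original code:

preprocessor.build_embeddings('embeddings/fasttext.vec', train_data, dev_data,
                              oov_as_unk=True, lower=True)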
Example no. 4
    def choose_mnist(self):
        print("CHOSE MNIST")
        global predictor, autoencode_model, embeddings

        predictor = autoencode_predict.predict(
            name="meta-data/mnist/autoencode_model", color_depth=1)
        predictor.stop()
        predictor.restore()
        autoencode_model = predictor.autoencode_model
        embeddings = Embeddings(predictor)

        print("Loading images ...")
        if 'mnist' not in self.data_sets:
            print("Key missing.  Building ImageData")
            imageData = LazyLoadWrapper(
                BatchWrapper(
                    ResizeWrapper(ReshapeWrapper(Mnist(), [28, 28, 1]),
                                  [32, 32])))
            imageData.getImages()
            self.data_sets['mnist'] = imageData

        print("  mnist shape is", self.data_sets['mnist'].getImages().shape)
        print("... loading images done")
        embeddings.data_set = self.data_sets['mnist'].getImages()
        return self.data_sets['mnist']
Example no. 5
    def __init__(self, text, args, device):
        super(NMT, self).__init__()
        self.text = text
        self.args = args
        self.device = device
        self.Embeddings = Embeddings(args['embed_size'], self.text)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=args['d_model'],
            nhead=args['nhead'],
            dim_feedforward=args['dim_feedforward'],
            dropout=args['dropout'])
        self.encoder_norm = nn.LayerNorm(args['d_model'])
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=args['num_encoder_layers'],
            norm=self.encoder_norm)
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=args['d_model'],
            nhead=args['nhead'],
            dim_feedforward=args['dim_feedforward'],
            dropout=args['dropout'])
        self.decoder_norm = nn.LayerNorm(args['d_model'])
        self.decoder = nn.TransformerDecoder(
            decoder_layer=self.decoder_layer,
            num_layers=args['num_decoder_layers'],
            norm=self.decoder_norm)
        self.project = nn.Linear(args['d_model'],
                                 len(self.text.tar),
                                 bias=False)
        self.project.weight = self.Embeddings.tar.weight
        self.dropout = nn.Dropout(args['dropout'])
        self.project_value = math.pow(args['d_model'], 0.5)
        self.eps = args['smoothing_eps']
Example no. 6
def test_word2vec_set():
    embed = Embeddings('./data/word2vec.txt', True, word_set={'a', 'b', 'c'})
    matrix = embed.matrix
    assert matrix.shape == (5, 3)
    assert len(embed.vocab) == 3
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
Example no. 7
    def __init__(self, iT, corefs, model):
        self.iT = iT
        self.corefs = corefs
        self.embeddings = Embeddings(model)

        dist, components = self.computeProgression()

        self.distances = dist
        self.components = components
Example no. 8
    def __init__(self,
                 X_train: list,
                 Y_train: list,
                 embed_path: str,
                 embed_dim: int,
                 stop_words=[],
                 X_test=[],
                 Y_test=[],
                 max_len=None,
                 epochs=3,
                 batch_size=256):

        # Preprocessing the text
        X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
        Y_train = np.asarray(Y_train)

        # Tokenizing the text
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)

        # Saving the tokenizer
        self.tokenizer = tokenizer

        # Creating the embedding matrix
        embedding = Embeddings(embed_path, embed_dim)
        embedding_matrix = embedding.create_embedding_matrix(
            tokenizer, len(tokenizer.word_counts))

        # Creating the padded input for the deep learning model
        if max_len is None:
            max_len = np.max([len(text.split()) for text in X_train])
        TextToTensor_instance = TextToTensor(tokenizer=tokenizer,
                                             max_len=max_len)
        X_train = TextToTensor_instance.string_to_tensor(X_train)

        # Creating the model
        rnn = RnnModel(embedding_matrix=embedding_matrix,
                       embedding_dim=embed_dim,
                       max_len=max_len)
        rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)

        self.model = rnn.model

        # If X_test is provided we make predictions with the created model
        if len(X_test) > 0:
            X_test = [clean_text(text) for text in X_test]
            X_test = TextToTensor_instance.string_to_tensor(X_test)
            yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]

            self.yhat = yhat

            # If true labels are provided we calculate the accuracy and F1 of the model
            if len(Y_test) > 0:
                self.acc = accuracy_score(Y_test,
                                          [1 if x > 0.5 else 0 for x in yhat])
                self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
Example no. 9
def embedding():
    if request.is_json:
        content = request.get_json()
        serializer = EmbeddingSerializer(data=content)
        if not serializer.is_valid():
            return 'Error'
        text = serializer.text
        token = serializer.token
        vector = Embeddings().build_sentence_vector(text).tolist()
        data = json.dumps({"vector": vector, "token": token})
        return data
    return 'Request body must be JSON', 400
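A hedged client-side sketch for exercising this endpoint; the route path, host/port, and JSON field names are assumptions, not taken from the original code:

import json
import requests

resp = requests.post('http://localhost:5000/embedding',
                     json={'text': 'hello world', 'token': 'abc123'})
vector = json.loads(resp.text)['vector']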
Example no. 10
    def __init__(self, data_name, num_class=5):
        self.data_name = data_name
        self.train_data_path = '../data/' + self.data_name + '/train.txt'
        self.test_data_path = '../data/' + self.data_name + '/test.txt'
        self.dev_data_path = '../data/' + self.data_name + '/dev.txt'
        self.embeddings = Embeddings(data_name)
        self.num_class = num_class
        start_time = time.time()
        self.load_data()
        print('Reading datasets consumes %.3f seconds' %
              (time.time() - start_time))
Example no. 11
def get_pretrained_embeddings(path, vocab, method='word2vec'):
    emb = Embeddings()
    model = emb.load_model(method=method, model_path=path)
    embed_size = model.vector_size
    embeddings = np.zeros((len(vocab), embed_size))
    oov_count = 0
    for word in vocab:
        word_index = vocab[word]
        if word in model.vocab:
            embeddings[word_index] = model[word]
        else:
            oov_count += 1
    print('OOV count: %i' % oov_count)
    return embeddings.astype('float32')
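A hedged usage sketch; the vocabulary dict and the embedding file path below are placeholders:

vocab = {'<pad>': 0, 'hello': 1, 'world': 2}  # word -> row index in the returned matrix
weights = get_pretrained_embeddings('embeddings/word2vec.vec', vocab, method='word2vec')
print(weights.shape)  # (len(vocab), embed_size), dtype float32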
Example no. 12
    def __init__(self):

        self.embeddings = Embeddings(path='Data/wordvectors.kv')

        with open('Data/ranking_dict/document_frequencies_text.p', 'rb') as fp:
            self.document_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/term_frequencies_text.p', 'rb') as fp:
            self.term_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/document_length_text.p', 'rb') as fp:
            self.document_length = pickle.load(fp)

        self.num_documents = len(self.term_frequencies)
        self.avg_length = mean(self.document_length.values())
Example no. 13
    def __init__(self, text, options, device):
        super(NMT, self).__init__()
        self.options = options
        self.embeddings = Embeddings(options.embed_size, text)
        self.hidden_size = options.hidden_size
        self.window_size_d = options.window_size_d
        self.text = text
        self.device = device
        self.encoder_layer = options.encoder_layer 
        self.decoder_layers = options.decoder_layers

        self.encoder = nn.LSTM(input_size=options.embed_size, hidden_size=options.hidden_size,
                               num_layers=options.encoder_layer, bias=True,
                               dropout=options.dropout_rate, bidirectional=False)
        self.decoder = nn.LSTM(input_size=options.embed_size + options.hidden_size,
                               hidden_size=options.hidden_size, num_layers=options.decoder_layers,
                               bias=True, dropout=options.dropout_rate, bidirectional=False)
        self.ht2tan = nn.Linear(in_features=self.hidden_size,
                                out_features=self.hidden_size, bias=False)
        self.tan2pt = nn.Linear(in_features=self.hidden_size, out_features=1, bias=False)
        self.ct2ht = nn.Linear(in_features=self.hidden_size * 2,
                               out_features=self.hidden_size, bias=False)
        self.ht2final = nn.Linear(in_features=self.hidden_size,
                                  out_features=len(self.text.tar), bias=False)
Example no. 14
	def embedKG(self):
		self.logger.info("Embedding NP and relation phrases")

		fname1 = self.p.out_path + self.p.file_entEmbed
		fname2 = self.p.out_path + self.p.file_relEmbed

		if not checkFile(fname1) or not checkFile(fname2):
			embed = Embeddings(self.p, self.side_info, self.logger)
			embed.fit()

			self.ent2embed = embed.ent2embed			# Get the learned NP embeddings
			self.rel2embed = embed.rel2embed			# Get the learned RP embeddings

			pickle.dump(self.ent2embed, open(fname1, 'wb'))
			pickle.dump(self.rel2embed, open(fname2, 'wb'))
		else:
			self.logger.info('\tLoading cached Embeddings')
			self.ent2embed = pickle.load(open(fname1, 'rb'))
			self.rel2embed = pickle.load(open(fname2, 'rb'))
Example no. 15
    def load_embeddings_vocab(self):
        pretrained_embeddings = Embeddings()

        word_prob = None  # unigram frequencies are only loaded from the cache in the else branch
        # read filtered embeddings
        if not tf.gfile.Exists(config.filtered_embeddings_path):
            word_to_vec = pretrained_embeddings.load_universal_embeddings()

            self.create_vocabulary(
                self.vocab_file,
                pretrained_embeddings.all_words(word_to_vec),
                tokenizer=None)
            word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)

            filtered_embeddings = pretrained_embeddings.filter_vocab_embeddings(
                word_to_vec, word_to_idx.keys())

            with open(config.filtered_embeddings_path, 'wb') as output_file:
                pickle.dump(filtered_embeddings,
                            output_file,
                            protocol=pickle.HIGHEST_PROTOCOL)

        else:
            word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)
            word_prob = self.read_unigram_freq(self.unigram_prob_file)
            assert 1.01 > sum(
                [0 if val is None else val
                 for val in word_prob.values()]) > 0.99, "unigram probs should sum to ~1"

        pre_embs_dict, embd_dim = pretrained_embeddings.load_filtered_pretrained_embeddings(
            config.filtered_embeddings_path)
        word_vec = pretrained_embeddings.get_embedding_matrix(
            pre_embs_dict, word_to_idx, embd_dim)

        self.word_vec = word_vec
        self.word_prob = word_prob
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

        train_path = os.path.join(config.data_dir, config.data_files['train'])
        dev_path = os.path.join(config.data_dir, config.data_files['dev'])
        self.write_data_to_token_ids(train_path, target_path=train_path)
        self.write_data_to_token_ids(dev_path, target_path=dev_path)
Example no. 16
    def choose_garden(self):
        print("CHOSE GARDEN")
        global predictor, autoencode_model, embeddings

        predictor = autoencode_predict.predict(
            name="meta-data/garden/garden_model", color_depth=3)
        predictor.stop()
        predictor.restore()
        autoencode_model = predictor.autoencode_model
        embeddings = Embeddings(predictor)
        config_data = json.load(open("data/file_data.json", "r"))

        print("Loading images ...")
        if 'garden' not in self.data_sets:
            print("Key missing.  Building ImageData")

            print("Loading files ...", end=" ")
            files = LazyLoadWrapper(
                ResizeWrapper(
                    FileReader(config_data["file_names"],
                               config_data["labels"]), [64, 64]))
            files.init()
            print("done.")
            print("Calculating full size ...", end=" ")
            full_size = LazyLoadWrapper(ResizeWrapper(files, [32, 32]))
            full_size.init()
            print("done.")
            print("Calculating half size ...", end=" ")
            half_size = LazyLoadWrapper(SliceWrapper(files, 32, 16))
            half_size.init()
            print("done.")
            print("Calculating concat the whole thing ...", end=" ")
            self.data_sets['garden'] = LazyLoadWrapper(
                BatchWrapper(ConcatWrapper([full_size, half_size])))
            print("done.")

            self.data_sets['garden'].getImages()

        print("  garden shape is", self.data_sets['garden'].getImages().shape)
        print("... loading images done")
        embeddings.data_set = self.data_sets['garden'].getImages()
        return self.data_sets['garden']
Example no. 17
def embeddings(args):
    kf = KFold(n_splits=args.splits_num, shuffle=args.shuffle, random_state=42)

    score_lst = list()

    for fold, (train_index, valid_index) in enumerate(kf.split(users)):
        train_users = users[train_index]
        train_movies = movies[train_index]
        train_ratings = ratings[train_index]

        valid_users = users[valid_index]
        valid_movies = movies[valid_index]
        valid_ratings = ratings[valid_index]

        model = Embeddings(
            number_of_users,
            number_of_movies,
            embeddings_size=args.embeddings_size,
            dropout_embeddings=args.embeddings_dropout_embeddings,
            dropout=args.embeddings_dropout)

        model.fit(train_users,
                  train_movies,
                  train_ratings,
                  valid_users=valid_users,
                  valid_movies=valid_movies,
                  valid_ratings=valid_ratings,
                  epochs=args.embeddings_num_epochs,
                  verbose=args.verbose,
                  decay=args.embeddings_decay,
                  decay_steps=args.embeddings_decay_steps,
                  learning_rate=args.embeddings_learning_rate,
                  batch_size=args.embeddings_batch_size)

        preds = model.predict(valid_users, valid_movies)

        score = root_mean_square_error(valid_ratings, preds)
        score_lst.append(score)

        print("Fold:", fold + 1, "score:", score)

    print('Mean CV RMSE:', np.mean(score_lst))
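A hedged sketch of the argument namespace this function expects; the attribute names are taken from the code above, while the values are purely illustrative:

from argparse import Namespace

args = Namespace(splits_num=5, shuffle=True, verbose=1,
                 embeddings_size=32, embeddings_dropout_embeddings=0.1,
                 embeddings_dropout=0.1, embeddings_num_epochs=10,
                 embeddings_decay=0.97, embeddings_decay_steps=1000,
                 embeddings_learning_rate=1e-3, embeddings_batch_size=1024)
embeddings(args)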
Example no. 18
def prepare_word_embeddings(query_lang_emb, qlang_long,
                            doc_lang_emb, dlang_long,
                            limit_emb, normalize=False, processes=40):
    """
    Creates a word-embedding helper object.
    :param query_lang_emb: language of the queries
    :param qlang_long: long-form name of the query language
    :param doc_lang_emb: language of the documents
    :param dlang_long: long-form name of the document language
    :param limit_emb: load only the first n embeddings
    :param normalize: transform embeddings to unit vectors
    :param processes: number of parallel workers
    :return: Embeddings object with both languages loaded
    """
    embeddings = Embeddings()
    embeddings.load_embeddings(query_lang_emb, processes=processes, language=qlang_long,
                               limit=limit_emb, normalize=normalize)
    embeddings.load_embeddings(doc_lang_emb, processes=processes, language=dlang_long,
                               limit=limit_emb, normalize=normalize)
    return embeddings
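An illustrative call with placeholder language identifiers and limits; the exact values expected for the language arguments depend on the Embeddings.load_embeddings implementation:

embeddings = prepare_word_embeddings('en', 'english', 'de', 'german',
                                     limit_emb=100000, normalize=True,
                                     processes=8)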
Example no. 19
    def __init__(self, text, model='Word2Vec'):
        self.model = Embeddings(model)
        self.model.fit_corpus(text)
        self.model.train()
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import random
import os

embeddings = Embeddings(100, 4, 1, 4)

# getting data from preprocessing
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

# generating training data
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings

word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = 128 
epochs = 10 
window_size = 5 
accuracy_threshold = 0.85
activation = 'relu' 
custom_accuracy = 0
loss_function = 'mse' 

model_name = 'POS_GRU ' + loss_function + "_"+ str(custom_accuracy) + "_" + activation + "_" + str(window_size) + "_" + str(batch_size)

embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()
pos2index, index2pos = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)

seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_pos_sentences:
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(y)
Example no. 22
def build_embedding(idxs=None, sequence_embeddings=None):
    return Embeddings(vocab.size(),
                      opts.embedding_dim,
                      idxs=idxs,
                      sequence_embeddings=sequence_embeddings)
Example no. 23
# model_name = 'places365vgg'
# layer_name = 'prob'
# model_path = './mynet.npy'

model_name = 'places365resnetft'
layer_name = 'bn5a_branch2c'
model_path = './resnet365ft.npy'

# model_name = 'places365resnet'
# layer_name = 'prob'
# model_path = './resnet365.npy'

path1 = "/run/user/1000/gvfs/sftp:host=10.2.36.75,user=anjan/tmp/anjan/2014-05-06-12-54-54/stereo/centre_corrected/"
path2 = "/run/user/1000/gvfs/sftp:host=10.2.36.75,user=anjan/tmp/anjan/2014-05-06-12-54-54/mono_rear_corrected/"
clf = Embeddings(model_name, layer_name, model_path)

imagenames1 = [f for f in listdir(path1)]
imagenames2 = [f for f in listdir(path2)]
imagenames1.sort()
imagenames2.sort()
images1 = [join(path1, i) for i in imagenames1]
images2 = [join(path2, i) for i in imagenames2]

temp1 = [[imagenames1[i], imagenames2[i]] for i in range(25)]

temp = [[
    int(imagenames1[i][:-4]) - int(imagenames1[i - 1][:-4]),
    int(imagenames2[i][:-4]) - int(imagenames2[i - 1][:-4])
] for i in range(1, 50)]
Example no. 24
def test_word2vec_size():
    embed = Embeddings('./data/word2vec.txt', True, 4)
    matrix = embed.matrix
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
Example no. 25
def main():
    sem_eval_data_dir = './data/semeval-2010-task-8'
    sem_eval_indices = [0, 1, 3, 5, 6, 7]

    train_words, train_starts, train_pos, train_link, train_dep, train_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT.all'), sem_eval_indices)

    train_starts = str_to_int(train_starts)
    train_link = str_to_int(train_link)

    train_rel_labels, train_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT'))

    train_branch1, train_branch2 = build_branches_indices(
        train_pair_positions, train_starts, train_link)

    test_words, test_starts, test_pos, test_link, test_dep, test_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT.all'), sem_eval_indices)

    test_starts = str_to_int(test_starts)
    test_link = str_to_int(test_link)

    test_rel_labels, test_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT'))
    test_branch1, test_branch2 = build_branches_indices(
        test_pair_positions, test_starts, test_link)

    rel_classes = sorted(set(train_rel_labels + test_rel_labels))
    rel_to_index = {l: i for i, l in enumerate(rel_classes)}
    index_to_relation = {i: l for i, l in enumerate(rel_classes)}

    pos_classes = sorted(
        {l
         for sent_pos in train_pos + test_pos for l in sent_pos})
    pos_to_index = build_labels_mapping(pos_classes)

    label_classes = sorted({
        l
        for sent_labels in train_ent_labels + test_ent_labels
        for l in sent_labels
    })
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)

    dep_classes = sorted(
        {l
         for sent_dep in train_dep + test_dep for l in sent_dep})
    dep_to_index = build_labels_mapping(dep_classes)

    word_set = {w for sent in train_words + test_words for w in sent}

    print(f'{len(word_set)} unique words found.')

    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt',
                       True,
                       word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_rel_ext_inputs(train_words, embed, train_pos,
                                       pos_to_index, train_ent_labels,
                                       label_to_index, train_dep, dep_to_index,
                                       train_branch1, train_branch2)
    train_outputs = [[rel_to_index[l]] for l in train_rel_labels]

    test_inputs = make_rel_ext_inputs(test_words, embed, test_pos,
                                      pos_to_index, test_ent_labels,
                                      label_to_index, test_dep, dep_to_index,
                                      test_branch1, test_branch2)

    model = build_rel_ext_model(len(rel_classes), embed_matrix,
                                len(label_classes), len(dep_classes),
                                len(pos_classes))

    train_generator = DataGenerator(train_inputs, (train_outputs, []), 32)

    evaluator = ModelEval(DataGenerator(test_inputs), test_rel_labels,
                          index_to_relation)

    model_saver = ModelCheckpoint(filepath='./checkpoints/' +
                                  model.name.replace(' ', '_') +
                                  '_{epoch:02d}.hdf5',
                                  verbose=1,
                                  save_best_only=True,
                                  monitor='valid_f1',
                                  mode='max')

    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/RE_log_{time_stamp}.csv", append=False)

    #model.load_weights('./checkpoints/relation_classifier_20.hdf5')

    model.fit_generator(train_generator,
                        epochs=20,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))
Example no. 26
import numpy as np
import label_predict
from embeddings import Embeddings
import nearest_neighbour
from data_source import LazyLoadWrapper, ResizeWrapper, ReshapeWrapper, Mnist

embeddings = Embeddings()

imageData = LazyLoadWrapper(
    ResizeWrapper(ReshapeWrapper(Mnist(False), [28, 28, 1]), [32, 32]))

embeddings.data_set = imageData.getImages()

number = 4

print "The number is", imageData.getLabels()[number]

nearest = nearest_neighbour.byIndex(number,
                                    embeddings.getEmbeddings(),
                                    size=200)
result = zip(imageData.getLabels()[nearest], nearest)
nearest = []
negative_examples = []
for label, data_index in result:
    label = np.argmax(label)
    if label == 4:
        nearest.append(data_index)
    else:
        negative_examples.append(data_index)

print "Pretend labeling the first", len(nearest), " ..."
Example no. 27
words = []


class MyCorpusTest(MyCorpus):
    def getMapX(self, line, index):
        if index == self.startToken:
            words.append(line[1])
        return super(MyCorpusTest, self).getMapX(line, index)

    def getMapY(self, line, index):
        return []


emb = Embeddings(
    fname="",  # "/data/wordembeddings/cc.en.300.vec",
    ws="http://127.0.0.1:8023/wordvectors_get?w1={}",
    unknownStore="unknown.300.vec",
    embSize=300)

print("Loading saved model")
json_file = open('model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
print("Loading weights")
model.load_weights("model/model.h5")
print("Loaded model from disk")

for corpus in ["corp", "equi", "wind"]:
    print("Processing corpus %s" % (corpus))
Example no. 28
    # sanity check for decoder
    batch_size = 64
    hidden_dim = 512
    input_dim = 100
    output_dim = 100
    max_length = 100
    num_heads = 8
    inner_dim = 1024
    dropout = 0.1
    num_layers = 12
    pad_id = 0
    seq_len = 100

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    embedding_layer = Embeddings(input_dim,
                                 hidden_dim,
                                 max_length,
                                 device,
                                 static=False)
    decoder_layer = DecoderLayer(hidden_dim, num_heads, inner_dim, dropout)
    decoder = Decoder(output_dim, hidden_dim, embedding_layer, decoder_layer,
                      num_layers, dropout)

    source = torch.LongTensor(batch_size, seq_len).random_(input_dim)
    source_mask = (source != pad_id).unsqueeze(1).unsqueeze(2)

    target = torch.LongTensor(batch_size, seq_len).random_(input_dim)
    target_mask = (target != pad_id).unsqueeze(1).unsqueeze(2)

    encoded_source = torch.rand(batch_size, seq_len, hidden_dim)
    output = decoder(target, target_mask, encoded_source, source_mask)
Example no. 29
def test_fast_text_fixed_size():
    embed = Embeddings('./data/fast_text.txt', True, 4, is_fast_text=True)
    matrix = embed.matrix
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
Example no. 30
def test_word2vec_case_sensitive():
    embed = Embeddings('./data/w2v_case_sensitive.txt', False, 4)
    matrix = embed.matrix
    assert (matrix[embed['A']] == np.ones((1, ))).all()
    assert (matrix[embed['a']] == np.ones((1, )) * 2).all()