Example no. 1
    def build_embeddings(self,
                         embeddings_vec_path,
                         *raw_datas,
                         oov_as_unk=True,
                         lower=True):
        """ Build Embeddings object that includes vector of words in data.

        Args:
            embeddings_vec_path (str): Path to the pretrained word vector file.
                Ex. FastText.
            raw_datas (list of dict): List of raw data **TOKENIZED** with
                tokenize_data load from json file.
            oov_as_unk (bool): Whether or not treat words not in pretrained
                word vectors set as OOVs. Otherwise, OOVs' embeddings will be
                randomly initialized.
        """
        words = {}
        for raw_data in raw_datas:
            words = self._collect_words(raw_data, words)

        self.embeddings = Embeddings(embeddings_vec_path,
                                     words,
                                     oov_as_unk,
                                     lower=lower)
        self.embeddings.add('<pad>',
                            torch.tensor([0.] * self.embeddings.get_dim()))
        self.embeddings.add('<teacher>')
        self.embeddings.add('<student>')
        self.embeddings.add('CANNOTANSWER')
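
A minimal usage sketch for the method above, assuming `prep` is an instance of the preprocessor class that defines build_embeddings and that train_data/dev_data are tokenized datasets loaded from JSON (these names are hypothetical):

# Hypothetical usage; the vector path and data variables are placeholders.
prep.build_embeddings('wiki.en.vec',          # pretrained FastText vectors
                      train_data, dev_data,   # tokenized raw data
                      oov_as_unk=True,
                      lower=True)
print(prep.embeddings.get_dim())              # dimensionality of the loaded vectors
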
Example no. 2
 def __init__(self, text, args, device):
     super(NMT, self).__init__()
     self.text = text
     self.args = args
     self.device = device
     self.Embeddings = Embeddings(args['embed_size'], self.text)
     self.encoder_layer = nn.TransformerEncoderLayer(
         d_model=args['d_model'],
         nhead=args['nhead'],
         dim_feedforward=args['dim_feedforward'],
         dropout=args['dropout'])
     self.encoder_norm = nn.LayerNorm(args['d_model'])
     self.encoder = nn.TransformerEncoder(
         encoder_layer=self.encoder_layer,
         num_layers=args['num_encoder_layers'],
         norm=self.encoder_norm)
     self.decoder_layer = nn.TransformerDecoderLayer(
         d_model=args['d_model'],
         nhead=args['nhead'],
         dim_feedforward=args['dim_feedforward'],
         dropout=args['dropout'])
     self.decoder_norm = nn.LayerNorm(args['d_model'])
     self.decoder = nn.TransformerDecoder(
         decoder_layer=self.decoder_layer,
         num_layers=args['num_decoder_layers'],
         norm=self.decoder_norm)
     self.project = nn.Linear(args['d_model'],
                              len(self.text.tar),
                              bias=False)
     self.project.weight = self.Embeddings.tar.weight
     self.dropout = nn.Dropout(args['dropout'])
     self.project_value = math.pow(args['d_model'], 0.5)
     self.eps = args['smoothing_eps']
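
The keys read from args above imply a configuration roughly like the following; the values are illustrative only, and text and device are assumed to come from the calling training script:

# Illustrative hyperparameters inferred from the keys accessed in __init__.
args = {
    'embed_size': 512, 'd_model': 512, 'nhead': 8,
    'dim_feedforward': 2048, 'dropout': 0.1,
    'num_encoder_layers': 6, 'num_decoder_layers': 6,
    'smoothing_eps': 0.1,
}
model = NMT(text, args, device)  # text and device are provided by the caller
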
Example no. 3
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    # The dimension of the multi-head model is the same as the embedding dimension. Does it have to be?
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
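
A quick smoke-test sketch; the vocabulary sizes and layer count are arbitrary placeholders:

# Build a tiny model just to check that construction works (placeholder sizes).
tmp_model = make_model(src_vocab=11, tgt_vocab=11, N=2)
print(sum(p.numel() for p in tmp_model.parameters()))  # rough parameter count
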
Example no. 4
    def choose_mnist(self):
        print "CHOSE MNIST"
        global predictor, autoencode_model, embeddings

        predictor = autoencode_predict.predict(
            name="meta-data/mnist/autoencode_model", color_depth=1)
        predictor.stop()
        predictor.restore()
        autoencode_model = predictor.autoencode_model
        embeddings = Embeddings(predictor)

        print "Loading images ..."
        if 'mnist' not in self.data_sets:
            print "Key missing.  Building ImageData"
            imageData = LazyLoadWrapper(
                BatchWrapper(
                    ResizeWrapper(ReshapeWrapper(Mnist(), [28, 28, 1]),
                                  [32, 32])))
            imageData.getImages()
            self.data_sets['mnist'] = imageData

        print "  mnist shape is", self.data_sets['mnist'].getImages().shape
        print "... loading images done"
        embeddings.data_set = self.data_sets['mnist'].getImages()
        return self.data_sets['mnist']
Example no. 5
    def __init__(self, iT, corefs, model):
        self.iT = iT
        self.corefs = corefs
        self.embeddings = Embeddings(model)

        dist, components = self.computeProgression()

        self.distances = dist
        self.components = components
Example no. 6
    def __init__(self,
                 X_train: list,
                 Y_train: list,
                 embed_path: str,
                 embed_dim: int,
                 stop_words=[],
                 X_test=[],
                 Y_test=[],
                 max_len=None,
                 epochs=3,
                 batch_size=256):

        # Preprocessing the text
        X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
        Y_train = np.asarray(Y_train)

        # Tokenizing the text
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)

        # Saving the tokenizer
        self.tokenizer = tokenizer

        # Creating the embedding matrix
        embedding = Embeddings(embed_path, embed_dim)
        embedding_matrix = embedding.create_embedding_matrix(
            tokenizer, len(tokenizer.word_counts))

        # Creating the padded input for the deep learning model
        if max_len is None:
            max_len = np.max([len(text.split()) for text in X_train])
        TextToTensor_instance = TextToTensor(tokenizer=tokenizer,
                                             max_len=max_len)
        X_train = TextToTensor_instance.string_to_tensor(X_train)

        # Creating the model
        rnn = RnnModel(embedding_matrix=embedding_matrix,
                       embedding_dim=embed_dim,
                       max_len=max_len)
        rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)

        self.model = rnn.model

        # If X_test is provided we make predictions with the created model
        if len(X_test) > 0:
            X_test = [clean_text(text) for text in X_test]
            X_test = TextToTensor_instance.string_to_tensor(X_test)
            yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]

            self.yhat = yhat

            # If true labels are provided we calculate the accuracy of the model
            if len(Y_test) > 0:
                self.acc = accuracy_score(Y_test,
                                          [1 if x > 0.5 else 0 for x in yhat])
                self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
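
A usage sketch, assuming the constructor above belongs to a trainer class, here called RnnTrainer (hypothetical name), and that embed_path points to a GloVe-style text file:

# Hypothetical usage; class name, path and data are placeholders.
trainer = RnnTrainer(X_train=["good movie", "terrible plot"],
                     Y_train=[1, 0],
                     embed_path="glove.6B.100d.txt",
                     embed_dim=100,
                     X_test=["decent film"],
                     epochs=1,
                     batch_size=32)
print(trainer.yhat)  # predicted probabilities for X_test
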
Example no. 7
 def __init__(self, data_name, num_class=5):
     self.data_name = data_name
     self.train_data_path = '../data/' + self.data_name + '/train.txt'
     self.test_data_path = '../data/' + self.data_name + '/test.txt'
     self.dev_data_path = '../data/' + self.data_name + '/dev.txt'
     self.embeddings = Embeddings(data_name)
     self.num_class = num_class
     start_time = time.time()
     self.load_data()
     print('Reading datasets consumes %.3f seconds' %
           (time.time() - start_time))
Example no. 8
class Pipeline():
    def __init__(self, text, model='Word2Vec'):
        self.model = Embeddings(model)
        self.model.fit_corpus(text)
        self.model.train()

    def evaluate(self, test='word-similarity', datasets=['wordsim353-rel']):
        evaluator = Evaluator(test=test,
                              datasets=datasets,
                              metric='spearman')
        return evaluator.evaluate(self.model)
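
A driving sketch for the pipeline; the corpus below is a toy placeholder, and its exact expected format depends on Embeddings.fit_corpus:

# Hypothetical usage with a placeholder corpus.
corpus = [["the", "cat", "sat"], ["the", "dog", "ran"]]
pipe = Pipeline(text=corpus, model='Word2Vec')
score = pipe.evaluate(test='word-similarity', datasets=['wordsim353-rel'])
print(score)
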
Example no. 9
def set_up():
    if request.is_json:
        content = request.get_json()
        if content['key'] == 'fox':
            Loader().download_all_models()
        if content['key'] == 'snake':
            embedding_model = Embeddings()
        if content['key'] == 'sitara':
            Loader().download_all_models()
            embedding_model = Embeddings()
        return 'All data is downloaded'
Example no. 10
class Sentiment(object):
    """NP_chunking data preparation"""
    def __init__(self, data_name, num_class=5):
        self.data_name = data_name
        self.train_data_path = '../data/' + self.data_name + '/train.txt'
        self.test_data_path = '../data/' + self.data_name + '/test.txt'
        self.dev_data_path = '../data/' + self.data_name + '/dev.txt'
        self.embeddings = Embeddings(data_name)
        self.num_class = num_class
        start_time = time.time()
        self.load_data()
        print('Reading datasets consumes %.3f seconds' %
              (time.time() - start_time))

    def deal_with_data(self, path):
        users, products, labels, docs, len_docs, len_words = [], [], [], [], [], []
        k = 0
        for line in open(path, 'r', encoding='UTF-8'):
            tokens = line.strip().split('\t\t')
            users.append(tokens[0])
            products.append(tokens[1])
            labels.append(int(tokens[2]) - 1)
            doc = tokens[3].strip().split('<sssss>')
            len_docs.append(len(doc))
            doc = [sentence.strip().split(' ') for sentence in doc]
            len_words.append([len(sentence) for sentence in doc])
            docs.append(doc)
            k += 1
        return users, products, labels, docs

    def load_data(self):
        train_users, train_products, train_labels, train_docs = self.deal_with_data(
            self.train_data_path)
        test_users, test_products, test_labels, test_docs = self.deal_with_data(
            self.test_data_path)
        dev_users, dev_products, dev_labels, dev_docs = self.deal_with_data(
            self.dev_data_path)

        train_docs = self.embeddings.docs2ids(train_docs)
        test_docs = self.embeddings.docs2ids(test_docs)
        dev_docs = self.embeddings.docs2ids(dev_docs)

        train_users = self.embeddings.users2ids(train_users)
        test_users = self.embeddings.users2ids(test_users)
        dev_users = self.embeddings.users2ids(dev_users)

        train_products = self.embeddings.prdts2ids(train_products)
        test_products = self.embeddings.prdts2ids(test_products)
        dev_products = self.embeddings.prdts2ids(dev_products)

        self.train_set = list(
            zip(train_docs, train_labels, train_users, train_products))
        self.test_set = list(
            zip(test_docs, test_labels, test_users, test_products))
        self.dev_set = list(zip(dev_docs, dev_labels, dev_users, dev_products))
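
Usage sketch, assuming a dataset directory laid out as ../data/<name>/{train,dev,test}.txt; the dataset name is a placeholder:

# Hypothetical usage; 'yelp13' stands for any dataset directory under ../data/.
data = Sentiment('yelp13', num_class=5)
print(len(data.train_set), len(data.dev_set), len(data.test_set))
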
Example no. 11
def get_pretrained_embeddings(path, vocab, method='word2vec'):
    emb = Embeddings()
    model = emb.load_model(method=method, model_path=path)
    embed_size = model.vector_size
    embeddings = np.zeros((len(vocab),embed_size))
    oov_count = 0
    for word in vocab:
        word_index = vocab[word]
        if word in model.vocab:
            embeddings[word_index] = model[word]
        else:
            oov_count += 1
    print('OOV count: %i'%oov_count)
    return embeddings.astype('float32')
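
A call sketch; the vocabulary maps words to row indices and the model path is a placeholder:

# Hypothetical usage; the path is a placeholder for a local word2vec model.
vocab = {'<pad>': 0, 'the': 1, 'cat': 2}
weights = get_pretrained_embeddings('word2vec.bin', vocab, method='word2vec')
print(weights.shape)  # (len(vocab), embedding dimension)
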
Example no. 12
    def __init__(self):

        self.embeddings = Embeddings(path='Data/wordvectors.kv')

        with open('Data/ranking_dict/document_frequencies_text.p', 'rb') as fp:
            self.document_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/term_frequencies_text.p', 'rb') as fp:
            self.term_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/document_length_text.p', 'rb') as fp:
            self.document_length = pickle.load(fp)

        self.num_documents = len(self.term_frequencies)
        self.avg_length = mean(self.document_length.values())
Example no. 13
    def test(self, test_dir, log_file, pos_thres, neg_thres):
        logging.info("Testing network using " + test_dir)

        iter = DataSetIterator(self, test_dir, 0)

        self._cat_embeddings = Embeddings(self._model_dir + "/cat.emb", w2v_layer_size, 0, False)
        self._slot_embeddings = Embeddings(self._model_dir + "/slot.emb", w2v_layer_size, 0, False)
        self._dist_embeddings = Embeddings(self._model_dir + "/dist.emb", w2v_layer_size, 0, False)
        self._pos_embeddings = Embeddings(self._model_dir + "/pos.emb", w2v_layer_size, 0, False)

        saver = tf.train.Saver()

        model_path = self._model_dir + "/model.out"

        with tf.Session() as sess:
            saver.restore(sess, model_path)

            batch_xs, batch_ys, records_in_batch = iter.next()

            logging.info("Number of test examples: " + str(len(records_in_batch)))

            correct_prediction = tf.equal(tf.argmax(self._network,1), tf.argmax(self._y,1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            y_p = tf.argmax(self._network, 1)
            y_p_raw = tf.split(1, 2, tf.nn.softmax(self._network))[1]

            val_accuracy, y_network, y_network_raw = sess.run([accuracy, y_p, y_p_raw], feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: 1.0, self._hidden_keep_prob: 1.0})

            y_true = np.argmax(batch_ys, 1)

            logging.info("Accuracy: " + str(val_accuracy))

            self._evaluate_thresholds(y_true, y_network_raw, pos_thres, neg_thres)

        with open(log_file+".classified1", "w") as out_correct, \
             open(log_file+".classified0", "w") as out_incorrect:

            logging.info("Writing to files")

            for i in range(len(records_in_batch)):
                prediction = y_network[i]

                if prediction >= 0.5:
                    out_correct.write(" ".join(records_in_batch[i].list) + " " + str(int(records_in_batch[i].value)) + "\n")
                else:
                    out_incorrect.write(" ".join(records_in_batch[i].list) + " " + str(int(records_in_batch[i].value)) + "\n")

        logging.info("Network testing complete")
Example no. 14
def test_all_terms_have_embeddings(path_terms: str,
                                   path_embeddings: str) -> None:
    """Test if all terms have embeddings for the given two files.

    Args:
        path_terms: Text-file with 1 term per line.
        path_embeddings: vec-file with term and dimension values
            separated by space.
    """
    terms = load_terms(path_terms)
    # Path placeholder kept from the original; point it at the idx_to_term JSON file.
    with open('<input here correct path>', 'r', encoding='utf8') as f:
        idx_to_term = json.load(f)
    embeddings = Embeddings.load_term_embeddings(terms, path_embeddings,
                                                 idx_to_term)
    embedded_terms = set(embeddings)
    not_in_et = []
    for t in terms:
        if t not in embedded_terms:
            not_in_et.append(t)

    if len(not_in_et) != 0:
        msg1 = 'Error! Not all terms have embeddings. '
        msg2 = 'Num terms without embeddings: {}. '.format(len(not_in_et))
        if len(not_in_et) < 20:
            msg3 = 'Terms without embeddings: {}'.format(not_in_et)
        else:
            msg3 = ''
        raise Exception(msg1 + msg2 + msg3)
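
A call sketch for the check above; both paths are placeholders:

# Hypothetical usage; paths are placeholders.
test_all_terms_have_embeddings('output/terms.txt',
                               'output/embeddings/embs_token_global_Word2Vec.vec')
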
Example no. 15
def test_word2vec_set():
    embed = Embeddings('./data/word2vec.txt', True, word_set={'a', 'b', 'c'})
    matrix = embed.matrix
    assert matrix.shape == (5, 3)
    assert len(embed.vocab) == 3
    assert (matrix[embed['a']] == np.ones((1, ))).all()
    assert (matrix[embed['c']] == np.ones((1, )) * 3).all()
Example no. 16
    def __init__(self, text, options, device):
        super(NMT, self).__init__()
        self.options = options
        self.embeddings = Embeddings(options.embed_size, text)
        self.hidden_size = options.hidden_size
        self.window_size_d = options.window_size_d
        self.text = text
        self.device = device
        self.encoder_layer = options.encoder_layer 
        self.decoder_layers = options.decoder_layers

        self.encoder = nn.LSTM(input_size=options.embed_size, hidden_size=options.hidden_size, num_layers=options.encoder_layer, bias=True, dropout=options.dropout_rate, bidirectional=False)
        self.decoder = nn.LSTM(input_size=options.embed_size+options.hidden_size, hidden_size=options.hidden_size, num_layers=options.decoder_layers, bias=True, dropout=options.dropout_rate, bidirectional=False)
        self.ht2tan = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=False)
        self.tan2pt = nn.Linear(in_features=self.hidden_size, out_features=1, bias=False)
        self.ct2ht = nn.Linear(in_features=self.hidden_size*2, out_features=self.hidden_size, bias=False)
        self.ht2final = nn.Linear(in_features=self.hidden_size, out_features=len(self.text.tar), bias=False)
Example no. 17
	def embedKG(self):
		self.logger.info("Embedding NP and relation phrases");

		fname1 = self.p.out_path + self.p.file_entEmbed
		fname2 = self.p.out_path + self.p.file_relEmbed

		if not checkFile(fname1) or not checkFile(fname2):
			embed = Embeddings(self.p, self.side_info, self.logger)
			embed.fit()

			self.ent2embed = embed.ent2embed			# Get the learned NP embeddings
			self.rel2embed = embed.rel2embed			# Get the learned RP embeddings

			pickle.dump(self.ent2embed, open(fname1, 'wb'))
			pickle.dump(self.rel2embed, open(fname2, 'wb'))
		else:
			self.logger.info('\tLoading cached Embeddings')
			self.ent2embed = pickle.load(open(fname1, 'rb'))
			self.rel2embed = pickle.load(open(fname2, 'rb'))
Example no. 18
    def choose_garden(self):
        print "CHOSE GARDEN"
        global predictor, autoencode_model, embeddings

        predictor = autoencode_predict.predict(
            name="meta-data/garden/garden_model", color_depth=3)
        predictor.stop()
        predictor.restore()
        autoencode_model = predictor.autoencode_model
        embeddings = Embeddings(predictor)
        config_data = json.load(open("data/file_data.json", "r"))

        print "Loading images ..."
        if 'garden' not in self.data_sets:
            print "Key missing.  Building ImageData"

            print "Loading files ...",
            files = LazyLoadWrapper(
                ResizeWrapper(
                    FileReader(config_data["file_names"],
                               config_data["labels"]), [64, 64]))
            files.init()
            print "done."
            print "Calculating full size ...",
            full_size = LazyLoadWrapper(ResizeWrapper(files, [32, 32]))
            full_size.init()
            print "done."
            print "Calculating half size ...",
            half_size = LazyLoadWrapper(SliceWrapper(files, 32, 16))
            half_size.init()
            print "done."
            print "Calculating concat the whole thing ...",
            self.data_sets['garden'] = LazyLoadWrapper(
                BatchWrapper(ConcatWrapper([full_size, half_size])))
            print "done."

            self.data_sets['garden'].getImages()

        print "  garden shape is", self.data_sets['garden'].getImages().shape
        print "... loading images done"
        embeddings.data_set = self.data_sets['garden'].getImages()
        return self.data_sets['garden']
Example no. 19
def embeddings(args):
    kf = KFold(n_splits=args.splits_num, shuffle=args.shuffle, random_state=42)

    score_lst = list()

    for fold, (train_index, valid_index) in enumerate(kf.split(users)):
        train_users = users[train_index]
        train_movies = movies[train_index]
        train_ratings = ratings[train_index]

        valid_users = users[valid_index]
        valid_movies = movies[valid_index]
        valid_ratings = ratings[valid_index]

        model = Embeddings(
            number_of_users,
            number_of_movies,
            embeddings_size=args.embeddings_size,
            dropout_embeddings=args.embeddings_dropout_embeddings,
            dropout=args.embeddings_dropout)

        model.fit(train_users,
                  train_movies,
                  train_ratings,
                  valid_users=valid_users,
                  valid_movies=valid_movies,
                  valid_ratings=valid_ratings,
                  epochs=args.embeddings_num_epochs,
                  verbose=args.verbose,
                  decay=args.embeddings_decay,
                  decay_steps=args.embeddings_decay_steps,
                  learning_rate=args.embeddings_learning_rate,
                  batch_size=args.embeddings_batch_size)

        preds = model.predict(valid_users, valid_movies)

        score = root_mean_square_error(valid_ratings, preds)
        score_lst.append(score)

        print("Fold:", fold + 1, "score:", score)

    print('Mean CV RMSE:', np.mean(score_lst))
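
The attributes read from args above can be sketched with an argparse Namespace; the values are illustrative and users/movies/ratings (plus the user/movie counts) are assumed to live at module scope:

from argparse import Namespace

# Illustrative arguments matching the attributes accessed in embeddings().
args = Namespace(splits_num=5, shuffle=True, verbose=1,
                 embeddings_size=32, embeddings_dropout_embeddings=0.1,
                 embeddings_dropout=0.1, embeddings_num_epochs=10,
                 embeddings_decay=0.97, embeddings_decay_steps=1000,
                 embeddings_learning_rate=1e-3, embeddings_batch_size=1024)
embeddings(args)
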
Example no. 20
def embedding():
    if request.is_json:
        content = request.get_json()
        serializer = EmbeddingSerializer(data=content)
        if not serializer.is_valid():
            return 'Error'
        text = serializer.text
        token = serializer.token
        vector = Embeddings().build_sentence_vector(text).tolist()
        data = json.dumps({"vector": vector, "token": token})
        return data
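
A client-side sketch of calling the endpoint; the URL and field names are placeholders based on what the serializer appears to expect:

import requests

# Hypothetical client call; URL and payload values are placeholders.
resp = requests.post('http://localhost:5000/embedding',
                     json={'text': 'hello world', 'token': 'abc123'})
print(resp.json()['vector'][:5])
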
Example no. 21
def get_clus_center(
    node: int,
    taxonomy: taxonomy_type,
    path_out: str,
) -> Iterator[float]:
    """Get the cluster center for given node id."""
    emb_path = os.path.join(path_out, 'embeddings/' + str(node) + '.vec')
    term_ids = set([t[0] for t in taxonomy[node]['terms']])
    local_embeddings = Embeddings.load_term_embeddings(
        term_ids, emb_path, term_ids_to_embs_global)
    clus_center = mean(local_embeddings, axis=0)
    return clus_center
Example no. 22
def load_term_ids_to_embs_global(lemmatized: bool, emb_type: str,
                                 path_out: str):
    """Load global term embeddings."""
    global term_ids_to_embs_global
    path_emb_dir = os.path.join(path_out, 'embeddings/')
    if lemmatized:
        fname = 'embs_lemma_global_{}.vec'.format(emb_type)
    else:
        fname = 'embs_token_global_{}.vec'.format(emb_type)

    emb_path = path_emb_dir + fname
    term_ids = load_term_ids(lemmatized, path_out)
    term_ids_to_embs_global = Embeddings.load_term_embeddings(
        term_ids, emb_path, {})
Example no. 23
    def __init__(self, path: str):
        """Initialize a hypernym classifier. Currently only svm
        classification is implemented.

        Args:
            path: The path to the output directory.
        """
        # Set paths.
        self.path = path
        self.path_idx_to_term = os.path.join(
            path, 'indexing/idx_to_token.json')
        self.path_term_to_idx = os.path.join(
            path, 'indexing/token_to_idx.json')
        self.path_embs = os.path.join(
            path, 'embeddings/embs_token_global_Word2Vec.vec')
        self.path_hearst = os.path.join(
            path, 'hierarchy/hierarch_rels_tokens_tg_idx.json')

        # Load data.
        with open(self.path_idx_to_term, 'r', encoding='utf8') as f:
            self.idx_to_term = {int(k): v for k, v in json.load(f).items()}
        with open(self.path_term_to_idx, 'r', encoding='utf8') as f:
            self.term_to_idx = json.load(f)
        with open(self.path_hearst, 'r', encoding='utf8') as f:
            self.hearst = {int(k): v for k, v in json.load(f).items()}
        self.hearst_term_ids = set()
        for hyper in self.hearst:
            self.hearst_term_ids.add(hyper)
            for hypo in self.hearst[hyper]:
                self.hearst_term_ids.add(hypo)
        self.path_term_idxs = os.path.join(
            self.path, 'processed_corpus/token_terms_idxs.txt')
        with open(self.path_term_idxs, 'r', encoding='utf8') as f:
            self.term_ids = set([int(i) for i in f.readlines()])
        self.embedding_dict = Embeddings.load_term_embeddings(
            set(self.idx_to_term.keys()), self.path_embs, self.idx_to_term)

        # Instantiate classifier.
        self.clf = SVC(kernel='rbf', C=10, gamma=0.1, probability=True,
                       random_state=0)
Example no. 24
    def load_embeddings_vocab(self):
        pretrained_embeddings = Embeddings()

        # read filtered embeddings
        if not tf.gfile.Exists(config.filtered_embeddings_path):
            word_to_vec = pretrained_embeddings.load_universal_embeddings()

            self.create_vocabulary(
                self.vocab_file,
                pretrained_embeddings.all_words(word_to_vec),
                tokenizer=None)
            word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)

            filtered_embeddings = pretrained_embeddings.filter_vocab_embeddings(
                word_to_vec, word_to_idx.keys())

            with open(config.filtered_embeddings_path, 'wb') as output_file:
                pickle.dump(filtered_embeddings,
                            output_file,
                            protocol=pickle.HIGHEST_PROTOCOL)

        else:
            word_to_idx, idx_to_word = self.read_vocabulary(self.vocab_file)
            word_prob = self.read_unigram_freq(self.unigram_prob_file)
            assert 1.01 > sum(
                [0 if val is None else val
                 for val in word_prob.values()]) > 0.99, "What?!"

        pre_embs_dict, embd_dim = pretrained_embeddings.load_filtered_pretrained_embeddings(
            config.filtered_embeddings_path)
        word_vec = pretrained_embeddings.get_embedding_matrix(
            pre_embs_dict, word_to_idx, embd_dim)

        self.word_vec = word_vec
        self.word_prob = word_prob
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

        train_path = os.path.join(config.data_dir, config.data_files['train'])
        dev_path = os.path.join(config.data_dir, config.data_files['dev'])
        self.write_data_to_token_ids(train_path, target_path=train_path)
        self.write_data_to_token_ids(dev_path, target_path=dev_path)
Example no. 25
def prepare_word_embeddings(query_lang_emb, qlang_long,
                            doc_lang_emb, dlang_long,
                            limit_emb, normalize=False, processes=40):
    """
    Creates Word Embedding Helper Object
    :param query_lang_emb: language of the queries
    :param qlang_long: long form of the query language name
    :param doc_lang_emb: language of the documents
    :param dlang_long: long form of the document language name
    :param limit_emb: load only the first n embeddings
    :param normalize: transform embeddings to unit vectors
    :param processes: number of parallel workers
    :return: Embeddings object with both languages loaded
    """
    embeddings = Embeddings()
    embeddings.load_embeddings(query_lang_emb, processes=processes, language=qlang_long,
                               limit=limit_emb, normalize=normalize)
    embeddings.load_embeddings(doc_lang_emb, processes=processes, language=dlang_long,
                               limit=limit_emb, normalize=normalize)
    return embeddings
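
A hedged call sketch; whether the *_emb arguments are language codes or embedding identifiers depends on Embeddings.load_embeddings, so the values below are placeholders only:

# Hypothetical call; all argument values are placeholders.
embs = prepare_word_embeddings(query_lang_emb='en', qlang_long='english',
                               doc_lang_emb='de', dlang_long='german',
                               limit_emb=100000, normalize=True, processes=8)
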
Example no. 26
class Network:
    def __init__(self, path, train, helper):
        self._helper = helper
        self._train_bool = train

        n_properties = self._helper.n_properties
        n_input = n_properties * w2v_layer_size

        if self._train_bool:
            self._prev_model = path
            logging.info("Using previous word2vec model: " + self._prev_model)
            self._word_vectors = WordVectors(self._prev_model, w2v_layer_size, "UNKNOWN")
        else:
            self._model_dir = path
            self._word_vectors = WordVectors(self._model_dir + "/word2vec.txt", w2v_layer_size, "UNKNOWN")

        self._x = tf.placeholder("float", [None, n_input])
        self._y = tf.placeholder("float", [None, n_classes])
        self._input_keep_prob = tf.placeholder("float")
        self._hidden_keep_prob = tf.placeholder("float")

        # ReLU
        w_h_stddev = math.sqrt(2 / n_input)

        # Xavier
        w_out_stddev = math.sqrt(3 / (nn_hidden_layer_size + n_classes))

        self._weights = {
            "h": tf.Variable(tf.truncated_normal([n_input, nn_hidden_layer_size], stddev=w_h_stddev), name="w_h"),
            "out": tf.Variable(tf.truncated_normal([nn_hidden_layer_size, n_classes], stddev=w_out_stddev), name="w_out")
        }

        self._biases = {
            "h": tf.Variable(tf.constant(0.1, shape=[nn_hidden_layer_size]), name="b_h"),
            "out": tf.Variable(tf.constant(0.0, shape=[n_classes]), name="b_out")
        }

        self._network = self._multilayer_perceptron(self._x, self._weights, self._biases)

    def _multilayer_perceptron(self, _X, _weights, _biases):
        input_layer_drop = tf.nn.dropout(_X, self._input_keep_prob)
        hidden_layer = tf.nn.relu(tf.add(tf.matmul(input_layer_drop, _weights["h"]), _biases["h"]))
        hidden_layer_drop = tf.nn.dropout(hidden_layer, self._hidden_keep_prob)
        return tf.matmul(hidden_layer_drop, _weights["out"]) + _biases["out"]

    def train(self, train_dir, model_dir):
        logging.info("Training network using " + train_dir)

        iter = DataSetIterator(self, train_dir, nn_batch_size)

        self._cat_embeddings = Embeddings(iter.cat_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._slot_embeddings = Embeddings(iter.slot_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._dist_embeddings = Embeddings(iter.dist_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._pos_embeddings = Embeddings(iter.pos_lexicon, w2v_layer_size, nn_embed_random_range, True)

        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self._network, self._y))
        regularizers = tf.nn.l2_loss(self._weights["h"]) + tf.nn.l2_loss(self._weights["out"]) + tf.nn.l2_loss(self._biases["h"]) + tf.nn.l2_loss(self._biases["out"])
        cost += nn_l2_reg * regularizers

        optimizer = tf.train.AdagradOptimizer(learning_rate=nn_learning_rate).minimize(cost)
        grads_wrt_input_op = tf.gradients(cost, self._x)[0]

        init = tf.initialize_all_variables()
        saver = tf.train.Saver(max_to_keep=0)

        with tf.Session() as sess:
            sess.run(init)

            for epoch in range(1, nn_epochs+1):
                logging.info("Training epoch " + str(epoch))

                curr_batch = 1
                sum_cost = 0

                while True:
                    next_batch = iter.next()
                    if not next_batch:
                        break

                    batch_xs, batch_ys, records_in_batch = next_batch

                    logging.info("Training batch " + str(epoch) + "/" + str(curr_batch))

                    _, grads_wrt_input = sess.run([optimizer, grads_wrt_input_op], feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: nn_dropout, self._hidden_keep_prob: nn_dropout})

                    logging.info("Network updated")

                    for i in range(len(records_in_batch)):
                        record = records_in_batch[i]
                        grad_wrt_input = nn_learning_rate * grads_wrt_input[i]
                        record.update_embeddings(grad_wrt_input, w2v_layer_size, self._cat_embeddings, self._slot_embeddings, self._dist_embeddings, self._pos_embeddings)

                    logging.info("Embeddings updated")

                    curr_cost = sess.run(cost, feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: nn_dropout, self._hidden_keep_prob: nn_dropout})

                    logging.info("Cost: " + str(curr_cost))

                    curr_batch += 1
                    sum_cost += curr_cost

                logging.info("Epoch cost: " + str(sum_cost/float(curr_batch-1)))

                model_epoch_dir = model_dir + "/epoch" + str(epoch)

                if not os.path.exists(model_epoch_dir):
                    os.makedirs(model_epoch_dir)

                self._serialize(saver, sess, model_epoch_dir)

                iter.reset()

            self._serialize(saver, sess, model_dir)

            logging.info("Network training complete")

    def test(self, test_dir, log_file, pos_thres, neg_thres):
        logging.info("Testing network using " + test_dir)

        iter = DataSetIterator(self, test_dir, 0)

        self._cat_embeddings = Embeddings(self._model_dir + "/cat.emb", w2v_layer_size, 0, False)
        self._slot_embeddings = Embeddings(self._model_dir + "/slot.emb", w2v_layer_size, 0, False)
        self._dist_embeddings = Embeddings(self._model_dir + "/dist.emb", w2v_layer_size, 0, False)
        self._pos_embeddings = Embeddings(self._model_dir + "/pos.emb", w2v_layer_size, 0, False)

        saver = tf.train.Saver()

        model_path = self._model_dir + "/model.out"

        with tf.Session() as sess:
            saver.restore(sess, model_path)

            batch_xs, batch_ys, records_in_batch = iter.next()

            logging.info("Number of test examples: " + str(len(records_in_batch)))

            correct_prediction = tf.equal(tf.argmax(self._network,1), tf.argmax(self._y,1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            y_p = tf.argmax(self._network, 1)
            y_p_raw = tf.split(1, 2, tf.nn.softmax(self._network))[1]

            val_accuracy, y_network, y_network_raw = sess.run([accuracy, y_p, y_p_raw], feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: 1.0, self._hidden_keep_prob: 1.0})

            y_true = np.argmax(batch_ys, 1)

            logging.info("Accuracy: " + str(val_accuracy))

            self._evaluate_thresholds(y_true, y_network_raw, pos_thres, neg_thres)

        with open(log_file+".classified1", "w") as out_correct, \
             open(log_file+".classified0", "w") as out_incorrect:

            logging.info("Writing to files")

            for i in range(len(records_in_batch)):
                prediction = y_network[i]

                if prediction >= 0.5:
                    out_correct.write(" ".join(records_in_batch[i].list) + " " + str(int(records_in_batch[i].value)) + "\n")
                else:
                    out_incorrect.write(" ".join(records_in_batch[i].list) + " " + str(int(records_in_batch[i].value)) + "\n")

        logging.info("Network testing complete")

    def _evaluate_thresholds(self, y_true, y_network_raw, pos_thres, neg_thres):
        for j in range(5, 10):
            pos_threshold = j / float(10)
            neg_threshold = (10 - j) / float(10)

            self._evaluate_threshold(y_true, y_network_raw, pos_threshold, neg_threshold)

        self._evaluate_threshold(y_true, y_network_raw, pos_thres, neg_thres)

    def _evaluate_threshold(self, y_true, y_network_raw, pos_threshold, neg_threshold):
        sub_true = list()
        sub_network = list()

        for i in range(len(y_true)):
            # inverse logit
            prediction = y_network_raw[i]

            if prediction >= pos_threshold:
                sub_true.append(y_true[i])
                sub_network.append(1)
            elif prediction <= neg_threshold:
                sub_true.append(y_true[i])
                sub_network.append(0)

        logging.info("Evaluation threshold: " + str(pos_threshold) + ", " + str(neg_threshold))

        sub_true.append(0)
        sub_network.append(0)
        sub_true.append(0)
        sub_network.append(1)
        sub_true.append(1)
        sub_network.append(0)
        sub_true.append(1)
        sub_network.append(1)

        confusion_matrix = sklearn.metrics.confusion_matrix(sub_true, sub_network)
        confusion_matrix -= 1

        logging.info("Examples labeled as 0 classified by model as 0: " + str(confusion_matrix[0][0]))
        logging.info("Examples labeled as 0 classified by model as 1: " + str(confusion_matrix[0][1]))
        logging.info("Examples labeled as 1 classified by model as 0: " + str(confusion_matrix[1][0]))
        logging.info("Examples labeled as 1 classified by model as 1: " + str(confusion_matrix[1][1]))

        logging.info("")

    def _writeUTF(self, string):
        utf8 = string.encode("utf-8")
        length = len(utf8)
        return struct.pack("!H", length) + struct.pack("!" + str(length) + "s", utf8)

    def _serialize(self, saver, sess, model_dir):
        logging.info("Serializing network")
        saver.save(sess, model_dir + "/model.out")

        wh = self._weights["h"].eval().reshape((1,-1), order="F")
        wout = self._weights["out"].eval().reshape((1,-1), order="F")
        bh = self._biases["h"].eval().reshape((1,-1), order="F")
        bout = self._biases["out"].eval().reshape((1,-1), order="F")

        h = np.hstack((wh, bh, wout, bout))

        if sys.byteorder == "little":
            h.byteswap(True)

        r, c = h.shape

        with open(model_dir + "/coeffs", "wb") as coeffs_file:
            coeffs_file.write(struct.pack("!i", 2))
            coeffs_file.write(struct.pack("!i", r))
            coeffs_file.write(struct.pack("!i", c))
            coeffs_file.write(struct.pack("!i", 1))
            coeffs_file.write(struct.pack("!i", 1))
            coeffs_file.write(self._writeUTF("float"))
            coeffs_file.write(self._writeUTF("real"))
            coeffs_file.write(self._writeUTF("HEAP"))
            coeffs_file.write(struct.pack("!i", c))
            coeffs_file.write(self._writeUTF("FLOAT"))

        with open(model_dir + "/coeffs", "ab") as coeffs_file:
            h.tofile(coeffs_file, "")

        logging.info("Serializing embeddings")
        self._cat_embeddings.serialize(model_dir + "/cat.emb")
        self._slot_embeddings.serialize(model_dir + "/slot.emb")
        self._dist_embeddings.serialize(model_dir + "/dist.emb")
        self._pos_embeddings.serialize(model_dir + "/pos.emb")
Example no. 27
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import os
import random

embeddings = Embeddings(100, 4, 1, 4)

# getting data from preprocessing
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

# generating training data
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
Example no. 28
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings

word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = 128 
epochs = 10 
window_size = 5 
accuracy_threshold = 0.85
activation = 'relu' 
custom_accuracy = 0
loss_function = 'mse' 

model_name = 'POS_GRU ' + loss_function + "_"+ str(custom_accuracy) + "_" + activation + "_" + str(window_size) + "_" + str(batch_size)

embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()
pos2index, index2pos = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)

seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_pos_sentences:
    
    for i in range(len(sentence)-window_size-1):
        
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(y)
Example no. 29
def build_embedding(idxs=None, sequence_embeddings=None):
    return Embeddings(vocab.size(),
                      opts.embedding_dim,
                      idxs=idxs,
                      sequence_embeddings=sequence_embeddings)
Example no. 30
class NMT(nn.Module):

    def __init__(self, text, options, device):
        super(NMT, self).__init__()
        self.options = options
        self.embeddings = Embeddings(options.embed_size, text)
        self.hidden_size = options.hidden_size
        self.window_size_d = options.window_size_d
        self.text = text
        self.device = device
        self.encoder_layer = options.encoder_layer 
        self.decoder_layers = options.decoder_layers

        self.encoder = nn.LSTM(input_size=options.embed_size, hidden_size=options.hidden_size, num_layers=options.encoder_layer, bias=True, dropout=options.dropout_rate, bidirectional=False)
        self.decoder = nn.LSTM(input_size=options.embed_size+options.hidden_size, hidden_size=options.hidden_size, num_layers=options.decoder_layers, bias=True, dropout=options.dropout_rate, bidirectional=False)
        self.ht2tan = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=False)
        self.tan2pt = nn.Linear(in_features=self.hidden_size, out_features=1, bias=False)
        self.ct2ht = nn.Linear(in_features=self.hidden_size*2, out_features=self.hidden_size, bias=False)
        self.ht2final = nn.Linear(in_features=self.hidden_size, out_features=len(self.text.tar), bias=False)
    
    def forward(self, source, target):
        len_ = []
        for sen in source:
            len_.append(len(sen))
        source_tensor = self.text.src.word2tensor(source, self.device).cuda()
        target_tensor = self.text.tar.word2tensor(target, self.device).cuda()
        encode_h, encode_len, encode_hn_cn = self.encode(source_tensor, len_)
        decode_out = self.decode(source_tensor, encode_hn_cn, encode_h, encode_len, target_tensor)
        P = nn.functional.log_softmax(self.ht2final(decode_out), dim=-1)  # sen_len * batch * vocab_size
        tar_mask = (target_tensor != self.text.tar['<pad>']).float()
        tar_log_pro = torch.gather(P, index=target_tensor[1:].unsqueeze(-1), dim=-1).squeeze(-1) * tar_mask[1:]
        return tar_log_pro.sum(dim=0)

    def encode(self, source_tensor, source_length):
        x = self.embeddings.src(source_tensor)
        source_length_tensor = torch.tensor(source_length, dtype=torch.int64)
        x = pack_padded_sequence(x, source_length_tensor.cpu(), enforce_sorted=False)
        output, (hn, cn) = self.encoder(x)
        output, each_len = pad_packed_sequence(output)
        output = output.permute(1, 0, 2)
        return output, each_len, (hn, cn)
    
    def decode(self, source_tensor, h0_c0, encode_h, encode_len, target_tensor):
        y = self.embeddings.tar(target_tensor)
        ht_ct = h0_c0
        ht = torch.zeros(encode_h.shape[0], self.hidden_size, device=self.device).cuda()
        output = []
        for y_t in y:
            now_ht_ct, now_ht = self.step(source_tensor, encode_h, encode_len, torch.cat((y_t, ht), dim=1).view(1, y.shape[1], -1), ht_ct)
            output.append(now_ht)
            ht_ct = now_ht_ct
            ht = now_ht
        return torch.stack(output).to(self.device).cuda() # sen_len * batch * hidden_size
    #@profile
    def step(self, source, encode_h, encode_len, pre_yt, pre_ht_ct):
        '''
        yt, ht_ct = self.decoder(pre_yt, pre_ht_ct)
        yt = torch.squeeze(yt, dim=0)
        pt = nn.functional.sigmoid(self.tan2pt(nn.functional.tanh(self.ht2tan(yt))))
        batch_ct = None
        with torch.no_grad():
            for i, each_pt in enumerate(pt):
                each_pt = encode_len[i].item() * each_pt.item()
                left = max(0, int(each_pt) - self.window_size_d)
                right = min(encode_len[i].item(), int(each_pt) + self.window_size_d)
                align = None
                for j in range(left, right):
                    if (j == left):
                        align = encode_h[i][j].view(1, -1)
                    else:
                        align = torch.cat((align, encode_h[i][j].view(1, -1)), dim=0)
                align = nn.functional.softmax(torch.squeeze(torch.bmm(yt[i].view(1, 1, -1), align.t().unsqueeze(dim=0)), dim=0).squeeze(dim=0))
                ex_p = torch.zeros(right-left, dtype=torch.float16)
                for j in range(left, right):
                    ex_p[j-left] = math.exp(-(j-each_pt)*(j-each_pt)/(self.window_size_d*self.window_size_d/2))
                ex_p = ex_p.to(self.device).cuda()
                align = align.to(self.device).cuda()
                at = align * ex_p
                ct = torch.zeros(self.hidden_size, dtype=torch.float16)
                ct = ct.to(self.device).cuda()
                for j in range(left, right):
                    ct += at[j-left]*encode_h[i][j]
                if (i == 0):
                    batch_ct = torch.cat((ct.view(1, -1), yt[i].view(1, -1)), dim=1)
                else:
                    batch_ct = torch.cat((batch_ct, torch.cat((ct.view(1, -1), yt[i].view(1, -1)), dim=1)), dim=0)
        #batch_ct = torch.zeros(pt.shape[0], self.hidden_size * 2, device=self.device)
        ht = nn.functional.tanh(self.ct2ht(batch_ct))
        batch_ct = None
        return ht_ct, ht
        '''
        encode_len = encode_len.cuda()
        yt, ht_ct = self.decoder(pre_yt, pre_ht_ct)
        yt = torch.squeeze(yt, dim=0) # batch * hidden_size
        batch_size = yt.shape[0]
        pt = torch.sigmoid(self.tan2pt(torch.tanh(self.ht2tan(yt)))).view(yt.shape[0]) * encode_len # batch
        pt = pt.view(batch_size, 1)  # batch * 1
        #with torch.no_grad():
        # encode_h : batch * sen_len * hidden_size
        pre_align = torch.bmm(yt.view(batch_size, 1, self.hidden_size), torch.transpose(encode_h, 1, 2)).squeeze(dim=1) # batch * sen_len
        src_mask = (source == self.text.src['<pad>']).long().t()
        src_mask = src_mask.cuda()
        #shuhe = torch.full((batch_size, encode_h.shape[1]), float("-inf"), dtype=torch.float, device=self.device)
        '''
        shuhe = torch.zeros((batch_size, encode_h.shape[1]), dtype=float)
        for i in range(batch_size):
            shuhe[i][encode_len[i].item():] = float('-inf')
        '''
        '''
        for i in range(batch_size):
            pre_align[i][encode_len[i].item():] = float('-inf')
        '''
        pre_align.data.masked_fill_(src_mask.bool(), float('-inf'))
        #sdz pre_align = pre_align - torch.tensor(shuhe, dtype=torch.float, device=self.device, requires_grad=False).reshape(batch_size, encode_h.shape[1])
        align = nn.functional.softmax(pre_align, dim=-1) # batch * sen_len
        per_s = torch.arange(0, encode_h.shape[1], dtype=torch.long, device=self.device).view(1, encode_h.shape[1]).expand(batch_size, encode_h.shape[1])
        at = align * torch.exp(-(torch.pow(per_s-pt, 2)/(self.window_size_d*self.window_size_d/2))) # batch * sen_len
        at = at.view(batch_size, -1, 1)
        pre_ct = at * encode_h # batch * sen_len * hidden_size
        ct = torch.cat((pre_ct.sum(dim=1), yt), dim=-1)
        ht = torch.tanh(self.ct2ht(ct))
        return ht_ct, ht
        
    
    def beam_search(self, src, search_size, max_tar_length, test_batch_size):
        '''
        src_tensor = self.text.src.word2tensor(src, self.device)
        all_h, encode_len, (h_n, c_n) = self.encode(src_tensor, [len(src)])
        sen_len = all_h.shape[1]
        new_all_h = all_h
        for i in range(search_size-1):
            new_all_h = torch.cat((new_all_h, all_h), dim=0)
        all_h = new_all_h
        all_h = all_h.cuda()
        encode_len = []
        for i in range(search_size):
            encode_len.append(len(src))
        encode_len = torch.tensor(encode_len, dtype=torch.long, device=self.device)
        encode_len = encode_len.cuda()
        h_n = h_n.cuda()
        c_n = c_n.cuda()
        now_h = h_n
        now_c = c_n
        end_id = self.text.tar['<end>']
        now_predict = [[self.text.tar['<start>']]]
        now_predict_words = [self.text.tar['<start>']]
        now_batch_word_tensor = torch.cat((self.embeddings.tar(torch.tensor([self.text.tar['<start>']], dtype=torch.long, device=self.device).cuda()), torch.zeros(1, self.hidden_size, dtype=torch.float, device=self.device).cuda()), dim=-1).reshape(1, 1, -1)
        predict = []
        now_predict_length = 0
        while (len(predict) < search_size and now_predict_length < max_tar_length):
            now_predict_length += 1
            next_ht_ct, next_ht = self.step(all_h[:len(now_predict)].reshape(len(now_predict), sen_len, -1), encode_len[:len(now_predict)], now_batch_word_tensor, (now_h, now_c))
            now_h, now_c = next_ht_ct
            now_h = now_h.permute(1, 0, 2)
            now_c = now_c.permute(1, 0, 2)
            P = nn.functional.softmax(self.ht2final(next_ht), dim=-1)
            padding_score = None
            for i in range(len(now_predict_words)):
                if (i == 0):
                    padding_score = P[i]
                else:
                    padding_score = torch.cat((padding_score, P[i]), dim=-1)
            _, topk_index = torch.topk(padding_score, search_size)
            next_predict_words = []
            next_predict = []
            next_h = None
            next_c = None
            now_final_h = None
            for i in range(search_size):
                next_word_id = topk_index[i].item() % len(self.text.tar)
                batch_id = topk_index[i].item() // len(self.text.tar)
                now_sen = now_predict[batch_id]
                if (next_word_id == end_id):
                    predict.append(now_sen[1:])
                    if (len(predict) == search_size):
                        break
                    continue
                next_predict_words.append(next_word_id)
                now_sen.append(next_word_id)
                next_predict.append(now_sen)
                if (next_h is None):
                    next_h = now_h[batch_id].reshape(1, 4, -1)
                    next_c = now_c[batch_id].reshape(1, 4, -1)
                    now_final_h = next_ht[batch_id].reshape(1, -1)
                else:
                    next_h = torch.cat((next_h, now_h[batch_id].reshape(1, 4, -1)), dim=0)
                    next_c = torch.cat((next_c, now_c[batch_id].reshape(1, 4, -1)), dim=0)
                    now_final_h = torch.cat((now_final_h, next_ht[batch_id].reshape(1, -1)), dim=0)
            if (len(predict) == search_size):
                break
            if (now_predict_length == max_tar_length):
                for sen in next_predict:
                    predict.append(sen[1:])
                    if (len(predict) == search_size):
                        break
            now_predict_words = next_predict_words
            now_predict = next_predict
            now_h = next_h.view(4, next_h.shape[0], -1).contiguous()
            now_c = next_c.view(4, next_c.shape[0], -1).contiguous()
            now_batch_word_tensor = torch.cat((self.embeddings.tar(torch.tensor(now_predict_words, dtype=torch.long, device=self.device)), now_final_h), dim=1)
            now_batch_word_tensor = now_batch_word_tensor.reshape(1, now_batch_word_tensor.shape[0], now_batch_word_tensor.shape[1])
        return predict
        '''
        encode_len = []
        for i in range(test_batch_size):
            encode_len.append(len(src[i]))
        src_tensor = self.text.src.word2tensor(src, self.device)
        now_source = src_tensor
        all_h, encode_len, (h_n, c_n) = self.encode(src_tensor, encode_len)
        sen_len = all_h.shape[1]
        now_all_h = all_h
        encode_len = torch.tensor(encode_len, dtype=torch.long, device=self.device)
        encode_len = encode_len.cuda()
        now_encode_len = encode_len
        now_h = h_n
        now_c = c_n
        predict = [[] for _ in range(test_batch_size)]
        now_predict = [[0] for _ in range(test_batch_size)]
        now_batch_word_tensor = torch.cat((self.embeddings.tar(self.text.tar.word2tensor(now_predict, self.device)).squeeze(dim=0), torch.zeros(test_batch_size, self.hidden_size, dtype=torch.float, device=self.device).cuda()), dim=-1).reshape(1, test_batch_size, -1)
        now_predict_length = 0
        now_score = torch.zeros(test_batch_size, dtype=torch.float, device=self.device).reshape(test_batch_size, 1)
        batch_index = [(i, 1) for i in range(test_batch_size)]
        while (now_predict_length < max_tar_length):
            now_predict_length += 1
            next_ht_ct, next_ht = self.step(now_source, now_all_h, now_encode_len, now_batch_word_tensor.contiguous(), (now_h, now_c))
            P = (nn.functional.softmax(self.ht2final(next_ht), dim=-1)+now_score).reshape(next_ht.shape[0]*len(self.text.tar))
            next_batch_index = []
            now_start = 0
            next_predict = []
            next_score = []
            next_words = []
            next_all_h = None
            next_encode_len = []
            next_h = None
            next_c = None
            now_ht = None
            next_source = None
            now_source = src_tensor.t()
            now_h, now_c = next_ht_ct
            now_h = now_h.permute(1, 0, 2)
            now_c = now_c.permute(1, 0, 2)
            flag = False
            for key, value in batch_index:
                score, topk_index = torch.topk(P[len(self.text.tar)*now_start:len(self.text.tar)*(value+now_start)], search_size)
                next_value = 0
                now_flag = False
                for i in range(search_size):
                    next_word_id = topk_index[i].item() % len(self.text.tar)
                    sent_id = topk_index[i].item() // len(self.text.tar)
                    if (next_word_id == self.text.tar['<end>']):
                        if (len(now_predict[now_start+sent_id][1:]) == 0):
                            continue
                        predict[key].append(((score[i].item()-now_score[now_start][0].item())/math.pow(len(now_predict[now_start+sent_id][1:]), config.alpha), now_predict[now_start+sent_id][1:].copy()))
                        if (len(predict[key]) == search_size):
                            now_flag = True
                            break
                        continue
                now_start += value
                if (now_flag):
                    continue
                for i in range(search_size):
                    next_word_id = topk_index[i].item() % len(self.text.tar)
                    sent_id = topk_index[i].item() // len(self.text.tar)
                    if (next_word_id == self.text.tar['<end>']):
                        continue
                    if (now_predict_length == max_tar_length):
                        predict[key].append((score[i].item()/math.pow(len(now_predict[now_start-value+sent_id][1:])+1, config.alpha), now_predict[now_start-value+sent_id][1:].copy()))
                        predict[key][-1][1].append(next_word_id)
                        if (len(predict[key]) == search_size):
                            now_flag = True
                            break
                        continue
                    next_value += 1
                    next_predict.append(now_predict[now_start-value+sent_id].copy())
                    next_predict[-1].append(next_word_id)
                    next_score.append(score[i].item())
                    next_words.append([next_word_id])
                    if (next_all_h is None):
                        next_all_h = all_h[key].reshape(1, -1, self.hidden_size)
                        next_encode_len.append(encode_len[key].item())
                        next_h = now_h[now_start-value+sent_id].reshape(1, 4, -1)
                        next_c = now_c[now_start-value+sent_id].reshape(1, 4, -1)
                        now_ht = next_ht[now_start-value+sent_id].reshape(1, -1)
                        next_source = now_source[key].reshape(1, -1)
                    else:
                        next_all_h = torch.cat((next_all_h, all_h[key].reshape(1, sen_len, self.hidden_size)), dim=0)
                        next_encode_len.append(encode_len[key].item())
                        next_h = torch.cat((next_h, now_h[now_start-value+sent_id].reshape(1, 4, -1)), dim=0)
                        next_c = torch.cat((next_c, now_c[now_start-value+sent_id].reshape(1, 4, -1)), dim=0)
                        now_ht = torch.cat((now_ht, next_ht[now_start-value+sent_id].reshape(1, -1)), dim=0)
                        next_source = torch.cat((next_source, now_source[key].reshape(1, -1)), dim=0)
                if (now_flag):
                    continue
                flag = True
                next_batch_index.append((key, next_value))
            if (not flag):
                break
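            # Re-batch the surviving beams (source tokens, scores, encoder outputs,
            # LSTM h/c states) for the next decoding step.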
            now_source = next_source.t()
            now_score = torch.tensor(next_score, dtype=torch.float, device=self.device).reshape(-1, 1)
            now_all_h = next_all_h
            now_encode_len = torch.tensor(next_encode_len, dtype=torch.long, device=self.device)
            now_h = next_h.permute(1, 0, 2).contiguous()
            now_c = next_c.permute(1, 0, 2).contiguous()
            now_predict = next_predict
            batch_index = next_batch_index
            now_batch_word_tensor = torch.cat((self.embeddings.tar(self.text.tar.word2tensor(next_words, self.device)).squeeze(dim=0), now_ht), dim=-1).reshape(1, len(next_encode_len), -1)
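        # Return the highest-scoring completed hypothesis for each source sentence.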
        output = []
        for sub in predict:
            sub = sorted(sub, key=lambda sc: sc[0], reverse=True)
            output.append(sub[0][1])
        return output

    @staticmethod
    def load(model_path):
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        model = NMT(params['text'], params['options'], params['device'])
        model.load_state_dict(params['state_dict'])
        return model
    
    def save(self, model_path):
        print(f"save model to path [{model_path}]")
        params = {
            'text': self.text,
            'options': self.options,
            'device': self.device,
            'state_dict': self.state_dict()
        }
        torch.save(params, model_path)
Esempio n. 31
0
    def train(self, train_dir, model_dir):
        logging.info("Training network using " + train_dir)

        iter = DataSetIterator(self, train_dir, nn_batch_size)

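        # One embedding table per input lexicon (categories, slots, distances, POS tags).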
        self._cat_embeddings = Embeddings(iter.cat_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._slot_embeddings = Embeddings(iter.slot_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._dist_embeddings = Embeddings(iter.dist_lexicon, w2v_layer_size, nn_embed_random_range, True)
        self._pos_embeddings = Embeddings(iter.pos_lexicon, w2v_layer_size, nn_embed_random_range, True)

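        # Softmax cross-entropy loss with L2 regularisation on the hidden and output parameters.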
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self._network, self._y))
        regularizers = tf.nn.l2_loss(self._weights["h"]) + tf.nn.l2_loss(self._weights["out"]) + tf.nn.l2_loss(self._biases["h"]) + tf.nn.l2_loss(self._biases["out"])
        cost += nn_l2_reg * regularizers

        optimizer = tf.train.AdagradOptimizer(learning_rate=nn_learning_rate).minimize(cost)
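        # Gradients of the cost w.r.t. the input features, used below to update the embedding tables.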
        grads_wrt_input_op = tf.gradients(cost, self._x)[0]

        init = tf.initialize_all_variables()
        saver = tf.train.Saver(max_to_keep=0)

        with tf.Session() as sess:
            sess.run(init)

            for epoch in range(1, nn_epochs+1):
                logging.info("Training epoch " + str(epoch))

                curr_batch = 1
                sum_cost = 0

                while True:
                    next_batch = iter.next()
                    if not next_batch:
                        break

                    batch_xs, batch_ys, records_in_batch = next_batch

                    logging.info("Training batch " + str(epoch) + "/" + str(curr_batch))

                    _, grads_wrt_input = sess.run([optimizer, grads_wrt_input_op], feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: nn_dropout, self._hidden_keep_prob: nn_dropout})

                    logging.info("Network updated")

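                    # Push the input gradients into the embedding tables,
                    # which are updated outside the TensorFlow optimiser.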
                    for i in range(len(records_in_batch)):
                        record = records_in_batch[i]
                        grad_wrt_input = nn_learning_rate * grads_wrt_input[i]
                        record.update_embeddings(grad_wrt_input, w2v_layer_size, self._cat_embeddings, self._slot_embeddings, self._dist_embeddings, self._pos_embeddings)

                    logging.info("Embeddings updated")

                    curr_cost = sess.run(cost, feed_dict={self._x: batch_xs, self._y: batch_ys, self._input_keep_prob: nn_dropout, self._hidden_keep_prob: nn_dropout})

                    logging.info("Cost: " + str(curr_cost))

                    curr_batch += 1
                    sum_cost += curr_cost

                logging.info("Epoch cost: " + str(sum_cost/float(curr_batch-1)))

                model_epoch_dir = model_dir + "/epoch" + str(epoch)

                if not os.path.exists(model_epoch_dir):
                    os.makedirs(model_epoch_dir)

                self._serialize(saver, sess, model_epoch_dir)

                iter.reset()

            self._serialize(saver, sess, model_dir)

            logging.info("Network training complete")
Esempio n. 32
0
 def build_gru(name, idxs):
     embeddings = Embeddings(vocab_size, embedding_dim, idxs=idxs)
     return GruRnn(name, embedding_dim, hidden_dim, opts, update_fn, h0,
                   embeddings.embeddings())
Esempio n. 33
0
class Embedding_retrieval:
    def __init__(self):

        self.embeddings = Embeddings(path='Data/wordvectors.kv')

        with open('Data/ranking_dict/document_frequencies_text.p', 'rb') as fp:
            self.document_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/term_frequencies_text.p', 'rb') as fp:
            self.term_frequencies = pickle.load(fp)

        with open('Data/ranking_dict/document_length_text.p', 'rb') as fp:
            self.document_length = pickle.load(fp)

        self.num_documents = len(self.term_frequencies)
        self.avg_length = mean(self.document_length.values())

    def get_closest_sentence(self, query, id, doc, topk=3):
        k = 1.5
        b = 0.75
        weights = []
        for term in re.findall(r"[\w']+|[.,!?;]", query):
            term = term.lower()
            if term not in self.document_frequencies:
                continue
            df = self.document_frequencies[term]
            idf = np.log((self.num_documents - df + 0.5) / (df + 0.5))
            document_dict = self.term_frequencies[id]
            if term not in document_dict:
                weights.append(0)
                continue
            tf = document_dict[term]
            wd = (
                (tf * (k + 1)) /
                (tf + k *
                 (1 - b + b * self.document_length[id] / self.avg_length))) + 1
            weights.append(idf * wd)

        query_embedding = self.weighted_embedding(query, weights)
        doc_embedding = []
        tokenized_sent = tokenize.sent_tokenize(doc)
        for sent in tokenized_sent:
            try:
                doc_embedding.append(self.sent_embedding(sent))
            except Exception as exc:
                raise RuntimeError(f'Failed to embed sentence: {sent!r}') from exc

        scores = []
        query_norm = np.linalg.norm(query_embedding)
        for i, emb in enumerate(doc_embedding):
            sent_norm = np.linalg.norm(emb)
            if sent_norm == 0:
                scores.append((i, 0))
            else:
                scores.append(
                    (i,
                     np.dot(emb, query_embedding) / (sent_norm * query_norm)))

        scores = sorted(scores, key=lambda x: x[1], reverse=True)

        most_similar = []
        for index, _ in scores[:topk]:
            most_similar.append(tokenized_sent[index])

        return most_similar

    def weighted_embedding(self, query, weights):
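        # NOTE: the BM25 weights computed by the caller are not applied here; the
        # query-term vectors are averaged uniformly. A term missing from the word
        # vectors would raise a KeyError below.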
        sum_weights = sum(weights)
        #weights = [w/sum_weights for w in weights]
        embeddings = []
        for term in re.findall(r"[\w']+|[.,!?;]", query):
            term = term.lower()
            embeddings.append(self.embeddings.wv[term])
        ones = np.ones(len(embeddings)) / len(embeddings)
        return np.dot(ones, embeddings)

    def sent_embedding(self, sentence):
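        # Average the vectors of all in-vocabulary terms;
        # fall back to a zero vector if none are found.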
        embeddings = None
        count = 0
        for term in re.findall(r"[\w']+|[.,!?;]", sentence):
            term = term.lower()
            if term in self.embeddings.wv.vocab:
                if embeddings is None:
                    embeddings = self.embeddings.get_embedding(term)
                else:
                    embeddings = np.add(embeddings,
                                        self.embeddings.get_embedding(term))
                count += 1
            else:
                pass
                #print(term)
        if embeddings is None:
            #print('Embeddings none for sentence: {}'.format(sentence))
            return np.zeros(100)
        return embeddings / count
Esempio n. 34
0
def main():
    sem_eval_data_dir = './data/semeval-2010-task-8'
    sem_eval_indices = [0, 1, 3, 5, 6, 7]

    train_words, train_starts, train_pos, train_link, train_dep, train_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT.all'), sem_eval_indices)

    train_starts = str_to_int(train_starts)
    train_link = str_to_int(train_link)

    train_rel_labels, train_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TRAIN_FILE.TXT'))

    train_branch1, train_branch2 = build_branches_indices(
        train_pair_positions, train_starts, train_link)

    test_words, test_starts, test_pos, test_link, test_dep, test_ent_labels = \
        load_conll(os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT.all'), sem_eval_indices)

    test_starts = str_to_int(test_starts)
    test_link = str_to_int(test_link)

    test_rel_labels, test_pair_positions = load_relations(
        os.path.join(sem_eval_data_dir, 'TEST_FILE_FULL.TXT'))
    test_branch1, test_branch2 = build_branches_indices(
        test_pair_positions, test_starts, test_link)

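    # Build label/index mappings for relations, POS tags, entity labels and dependency relations.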
    rel_classes = sorted(set(train_rel_labels + test_rel_labels))
    rel_to_index = {l: i for i, l in enumerate(rel_classes)}
    index_to_relation = {i: l for i, l in enumerate(rel_classes)}

    pos_classes = sorted(
        {l
         for sent_pos in train_pos + test_pos for l in sent_pos})
    pos_to_index = build_labels_mapping(pos_classes)

    label_classes = sorted({
        l
        for sent_labels in train_ent_labels + test_ent_labels
        for l in sent_labels
    })
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)

    dep_classes = sorted(
        {l
         for sent_dep in train_dep + test_dep for l in sent_dep})
    dep_to_index = build_labels_mapping(dep_classes)

    word_set = {w for sent in train_words + test_words for w in sent}

    print(f'{len(word_set)} unique words found.')

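    # Load pretrained GloVe vectors restricted to the words seen in the data.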
    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt',
                       True,
                       word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_rel_ext_inputs(train_words, embed, train_pos,
                                       pos_to_index, train_ent_labels,
                                       label_to_index, train_dep, dep_to_index,
                                       train_branch1, train_branch2)
    train_outputs = [[rel_to_index[l]] for l in train_rel_labels]

    test_inputs = make_rel_ext_inputs(test_words, embed, test_pos,
                                      pos_to_index, test_ent_labels,
                                      label_to_index, test_dep, dep_to_index,
                                      test_branch1, test_branch2)

    model = build_rel_ext_model(len(rel_classes), embed_matrix,
                                len(label_classes), len(dep_classes),
                                len(pos_classes))

    train_generator = DataGenerator(train_inputs, (train_outputs, []), 32)

    evaluator = ModelEval(DataGenerator(test_inputs), test_rel_labels,
                          index_to_relation)

    model_saver = ModelCheckpoint(filepath='./checkpoints/' +
                                  model.name.replace(' ', '_') +
                                  '_{epoch:02d}.hdf5',
                                  verbose=1,
                                  save_best_only=True,
                                  monitor='valid_f1',
                                  mode='max')

    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/RE_log_{time_stamp}.csv", append=False)

    #model.load_weights('./checkpoints/relation_classifier_20.hdf5')

    model.fit_generator(train_generator,
                        epochs=20,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))