Example #1
def evaluate_model(model, data, corpus, word_to_index, cuda, eval_func):
    rrs = []
    for query in data.keys():
        positives, candidates = data[query]

        embeddings = []
        embeddings.append(
            pad(merge_title_and_body(corpus[query]), len(word_to_index)))
        for candidate in candidates:
            embeddings.append(
                pad(merge_title_and_body(corpus[candidate]),
                    len(word_to_index)))
        embeddings = Variable(torch.from_numpy(np.array(embeddings)))
        if cuda:
            embeddings = embeddings.cuda()

        encodings = model(embeddings)
        similarities = F.cosine_similarity(encodings[1:],
                                           encodings[0].repeat(
                                               len(encodings) - 1, 1),
                                           dim=1)
        _, candidates_ranked = zip(
            *sorted(zip(similarities.data, candidates), reverse=True))
        rrs.append(eval_func(positives, candidates_ranked))
    return np.mean(rrs)
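Several of the PyTorch examples in this listing (Examples #1, #9, #10, and #11) call pad(sequence, len(word_to_index)), so this pad presumably clips or right-pads a token-index sequence to a fixed length, using the vocabulary size as a reserved padding index. A minimal sketch under those assumptions (MAX_LENGTH is hypothetical; the real helper is not shown):

import numpy as np

MAX_LENGTH = 100  # assumed fixed sequence length; not in the excerpts

def pad(index_sequence, pad_index):
    # Clip to MAX_LENGTH, then right-pad with `pad_index` (typically
    # len(word_to_index), i.e. an index one past the real vocabulary
    # that an embedding layer can reserve for padding).
    trimmed = list(index_sequence[:MAX_LENGTH])
    return np.array(trimmed + [pad_index] * (MAX_LENGTH - len(trimmed)))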
Example #3
    def __init__(self, trees, minibatch_size=128, wordmap=None, shuffle=True):

        self.wordmap = wordmap or self._build_wordmap(trees)

        is_leafs = self._is_leafs(trees)
        word_indices = self._word_indices(trees, self.wordmap)
        child_indices = self._child_indices(trees)
        targets = self._targets(trees)

        # Apply padding
        is_leafs, lengths = helpers.pad(is_leafs, 0)
        word_indices, _ = helpers.pad(word_indices, 0)
        child_indices, _ = helpers.pad(child_indices, 0)
        targets, _ = helpers.pad(targets, 0)

        if shuffle:
            # Shuffle
            lengths, \
            is_leafs, \
            word_indices, \
            child_indices, \
            targets = helpers.parallel_shuffle(
                (lengths, is_leafs, word_indices, child_indices, targets)
            )

            # Now sort in order of increasing input length
            lengths, \
            is_leafs, \
            word_indices, \
            child_indices, \
            targets = helpers.sort_by(
                lengths,
                (lengths, is_leafs, word_indices, child_indices, targets)
            )

        # Transpose data from example -> sequence to sequence -> example
        is_leafs = is_leafs.transpose(1, 0)
        word_indices = word_indices.transpose(1, 0)
        child_indices = child_indices.transpose(1, 2, 0)  # seq -> l/r -> ex
        targets = targets.transpose(1, 0)

        # Load into theano shared vars
        self.is_leafs = theano.shared(is_leafs, borrow=True)
        self.word_indices = theano.shared(word_indices, borrow=True)
        self.child_indices = theano.shared(child_indices, borrow=True)
        self.targets = theano.shared(targets, borrow=True)
        # these need to be stored on the GPU as floats and casted when needed
        self.lengths = theano.shared(lengths.astype(theano.config.floatX),
                                     borrow=True)

        self.minibatch_size = minibatch_size
        self.minibatch_count = int(
            numpy.ceil(float(len(trees)) / minibatch_size))
Example #5
    def __init__(self,
                 inputs,
                 targets,
                 minibatch_size=128,
                 input_padding_val=0,
                 target_padding_val=ctc.PADDING):
        """Load a dataset into Theano shared variables."""

        # Apply padding
        inputs, lengths = helpers.pad(inputs, input_padding_val)
        targets, _ = helpers.pad(targets, target_padding_val)

        # Shuffle
        inputs, \
        targets, \
        lengths = helpers.parallel_shuffle(
            (inputs, targets, lengths)
        )

        # Now sort in order of increasing input length
        inputs, \
        targets, \
        lengths = helpers.sort_by(
            lengths,
            (inputs, targets, lengths)
        )

        # Transpose data from example -> sequence to sequence -> example
        inputs = inputs.transpose(1, 0, 2)
        # Does each example have a target class (instead of an output vector)?
        classification = (len(targets.shape) == 2)
        if classification:
            targets = targets.transpose(1, 0)
        else:
            targets = targets.transpose(1, 0, 2)

        # Load into theano shared vars
        self.inputs = theano.shared(inputs, borrow=True)
        self.targets = theano.shared(targets, borrow=True)
        # this needs to be stored on the GPU as floats and casted when needed
        self.lengths = theano.shared(lengths.astype(theano.config.floatX),
                                     borrow=True)

        self.minibatch_size = minibatch_size
        self.minibatch_count = int(
            numpy.ceil(float(inputs.shape[1]) / minibatch_size))

        self.ttargets = ctc.transform_targets(self.targets)
Example #6
def just_encrypt(msg, key, decrease_unknown=0):
    msg = msg + unknown[decrease_unknown:]    
    # padchars = (random_bytes(5, 10), random_bytes(5, 10))
    # msg = padchars[0] + msg + padchars[1]
    msg = pad(msg, 16)
    encrypted = encrypt_ecb(msg, key)
    return encrypted
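The cryptography examples (this one and Examples #12, #14, #16, and #17) call pad(msg, 16) before block encryption. The helper itself is not shown; PKCS#7 block padding is the usual choice, and a minimal sketch under that assumption, for the Python 2 byte strings these snippets use, is:

def pad(msg, block_size):
    # PKCS#7 (assumed): append n copies of the byte of value n so the
    # length becomes a multiple of block_size; a full extra block is
    # appended when the message is already aligned.
    n = block_size - (len(msg) % block_size)
    return msg + chr(n) * n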
Example #7
def ising_energy(model, states):
    """
    Compute local energies of the Ising model.

    Parameters
    ----------
    model : model whose ``factors`` op maps (padded) states to per-site factors
    states : Tensor of shape (N, system_size)

    Returns
    -------
    energies : tensor of shape (N,)
    """
    batch_size = tf.shape(states)[0]
    states_shaped = tf.reshape(states, (batch_size,)+SYSTEM_SHAPE)
    states_padded = pad(states_shaped, SYSTEM_SHAPE, [(K-1)//2]*N_DIMS)
    factors = tf.reshape(model.factors(states_padded), (batch_size, -1))
    factor_windows = all_windows(factors, SYSTEM_SHAPE, HALF_WINDOW_SHAPE)
    spin_windows = all_windows(states, SYSTEM_SHAPE, FULL_WINDOW_SHAPE)
    flipper = np.ones(FULL_WINDOW_SIZE, dtype=np.int32)
    flipper[(FULL_WINDOW_SIZE-1)//2] = -1
    spins_flipped = spin_windows * flipper
    factors_flipped = tf.reshape(
        model.factors(tf.reshape(
            spins_flipped, (batch_size*NUM_SPINS,)+FULL_WINDOW_SHAPE)),
        (batch_size, NUM_SPINS, HALF_WINDOW_SIZE))

    log_pop = tf.reduce_sum(factors_flipped - factor_windows, 2)
    alignedness = tf.reduce_sum(interactions(states, SYSTEM_SHAPE), [1, 2])
    energy = (-H * tf.reduce_sum(tf.exp(log_pop), 1)
              - tf.cast(alignedness, tf.complex64))
    return energy / NUM_SPINS
Example #8
    def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
        """
        This method runs a simple version of the perceptron algorithm on the data given.
        :param data: numpy array of each data point to be used for training. This array should already be padded with
        a 1's column in order to ensure a bias weight is included.
        :param labels: numpy array specify the labels {-1, 1}
        :return: None
        """
        # Pad the data with an all ones vector.
        p_data = pad(data)

        # Initialize the weights.
        self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])

        for epoch in range(epochs):

            # Go through each data point.
            for x, y in zip(*shuffle(p_data, labels)):

                # If (w^t*x + b)*y < margin make an update.
                if np.dot(self.weights, x) * y < self.margin:
                    # Calculate the aggressive learning rate.
                    aggressive_learning_rate = (self.margin - (y * np.dot(self.weights, x))) / (np.dot(x, x) + 1)

                    # Update the weights
                    self.weights = self.weights + (aggressive_learning_rate * y * x)

                    # Record update count.
                    self.update_count += 1

            # record epoch specific information if specified
            if record_epochs:
                val_x, val_y = validation_set[0], validation_set[1]
                self.epoch_records[epoch + 1] = {'accuracy': accuracy(self.predict(val_x), val_y),
                                                 'weights': self.weights}
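The perceptron training loops here and in Examples #15 and #19 iterate with zip(*shuffle(p_data, labels)), so shuffle presumably returns the data and labels permuted in parallel. A minimal sketch under that assumption:

import numpy as np

def shuffle(data, labels):
    # Assumed helper: apply one random permutation to both arrays so
    # each point stays aligned with its label.
    perm = np.random.permutation(len(data))
    return data[perm], labels[perm]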
Example #9
def get_dis_batch(batch_size, corpus, word_to_index):
    keys = list(corpus.keys())
    random.shuffle(keys)
    keys = keys[:batch_size]
    query_index_sequences = [merge_title_and_body(corpus[k]) for k in keys]
    embedded = [pad(seq, len(word_to_index)) for seq in query_index_sequences]
    batch = torch.from_numpy(np.array(embedded))
    return batch
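merge_title_and_body is another unshown helper; from its use on corpus entries it presumably concatenates the title and body token-index sequences of a question. A sketch assuming each corpus entry is a (title, body) pair of index lists (the entry format is a guess):

def merge_title_and_body(entry):
    # Assumed corpus format: (title_indices, body_indices).
    title, body = entry
    return list(title) + list(body)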
Example #10
def process_batch_pairs(pairs, data, corpus, word_to_index):
    batch_querys = []
    batch_positives = []
    batch_negatives = []
    for query, positive in pairs:
        query_index_sequence = merge_title_and_body(corpus[query])
        batch_querys.append(pad(query_index_sequence, len(word_to_index)))
        positive_index_sequence = merge_title_and_body(corpus[positive])
        batch_positives.append(pad(positive_index_sequence,
                                   len(word_to_index)))
        negatives = [merge_title_and_body(corpus[neg])
                     for neg in random.sample(data[(query, positive)],
                                              NEGATIVE_QUERYS_PER_SAMPLE)]
        negatives = [pad(neg, len(word_to_index)) for neg in negatives]
        batch_negatives.append(negatives)

    batch_querys = torch.from_numpy(np.array(batch_querys))
    batch_positives = torch.from_numpy(np.array(batch_positives))
    batch_negatives = torch.from_numpy(np.array(batch_negatives))
    return batch_querys, batch_positives, batch_negatives
Example #11
def evaluate_model(model, data, corpus, word_to_index, cuda):
    auc = AUCMeter()
    for query in data.keys():
        positives = set(data[query][0])
        candidates = data[query][1]

        embeddings = [pad(merge_title_and_body(corpus[query]), len(word_to_index))]
        targets = []
        for candidate in candidates:
            embeddings.append(pad(merge_title_and_body(corpus[candidate]), len(word_to_index)))
            targets.append(IS_SIMMILAR_LABEL if candidate in positives else NOT_SIMMILAR_LABEL)
        embeddings = Variable(torch.from_numpy(np.array(embeddings)))
        targets = torch.from_numpy(np.array(targets))
        if cuda:
            embeddings = embeddings.cuda()

        encodings = model(embeddings)
        query_encoding = encodings[0]
        candidate_encodings = encodings[1:]
        similarities = F.cosine_similarity(
            candidate_encodings,
            query_encoding.repeat(len(encodings) - 1, 1),
            dim=1)
        auc.add(similarities.data, targets)
    return auc.value(MAXIMUM_FALSE_POSITIVE_RATIO)
Example #12
def encrypt_cbc(msg, key, iv):
    cipher = AES.new(key, AES.MODE_ECB)
    msg = pad(msg, 16)
    msg = str2hex(msg)
    blocks = [msg[i:i + 32] for i in range(0, len(msg), 32)]
    encrypted = ''
    en = str2hex(iv)

    for block in blocks:
        ip = xor(en, block)
        en = cipher.encrypt(ip.decode('hex')).encode('hex')
        encrypted += en

    return encrypted
Example #13
def heisenberg_energy(model, states):
    """
    Compute local energies of the antiferromagnetic Heisenberg model.

    Parameters
    ----------
    model : model whose ``factors`` op maps (padded) states to per-site factors
    states : Tensor of shape (N, system_size)

    Returns
    -------
    energies : tensor of shape (N,)
    """
    FULL_WINDOW_SHAPE = (K*2-1+2,)*N_DIMS
    FULL_WINDOW_SIZE = np.prod(FULL_WINDOW_SHAPE)
    HALF_WINDOW_SHAPE = (K+2,)*N_DIMS
    HALF_WINDOW_SIZE = np.prod(HALF_WINDOW_SHAPE)

    batch_size = tf.shape(states)[0]
    states_shaped = tf.reshape(states, (batch_size,)+SYSTEM_SHAPE)
    states_padded = pad(states_shaped, SYSTEM_SHAPE, [(K-1)//2]*N_DIMS)
    factors = tf.reshape(model.factors(states_padded), (batch_size, -1))
    factor_windows = all_windows(factors, SYSTEM_SHAPE, HALF_WINDOW_SHAPE)
    spin_windows = all_windows(states, SYSTEM_SHAPE, FULL_WINDOW_SHAPE)

    flippers = np.zeros((N_DIMS, FULL_WINDOW_SIZE), dtype=np.int32)
    for d in range(N_DIMS):
        flipper = np.ones(FULL_WINDOW_SHAPE, dtype=np.int32)
        halfpoint = tuple((s-1)//2 for s in FULL_WINDOW_SHAPE)
        neighbour = tuple(x+1 if i == d else x
                          for i, x in enumerate(halfpoint))
        flipper[halfpoint] = -1
        flipper[neighbour] = -1
        flippers[d, :] = flipper.flatten()

    # [N, flip_direction=1..N_DIMS, spin_window=1..SYSTEM_SIZE, window_size]
    spins_flipped = spin_windows[:, None, :, :] * flippers[None, :, None, :]
    factors_flipped = tf.reshape(
        model.factors(tf.reshape(
            spins_flipped, (batch_size*N_DIMS*NUM_SPINS,)+FULL_WINDOW_SHAPE)),
        (batch_size, N_DIMS, NUM_SPINS, HALF_WINDOW_SIZE))

    log_pop = tf.reduce_sum(factors_flipped - factor_windows[:, None, :, :], 3)
    ints = tf.cast(interactions(states, SYSTEM_SHAPE), tf.complex64)

    terms = -(1 - ints) * tf.exp(log_pop) + ints
    energy = tf.reduce_sum(terms, [1, 2])

    return energy / NUM_SPINS
Example #14
def encrypt(message, key, IV=None):
    if (message is None) or (len(message) == 0):
        raise ValueError('message cannot be null or empty')
    if IV is None:
        IV = generateIV(blockSize)
    
    cipherText = bytes(IV)

    paddedMessage = pad(message, blockSize)
    blocks = chunkMessage(paddedMessage, blockSize)
    for block in blocks:
        # update the IV to be the newly encrypted ciphertext.
        IV = encryptBlock(block, key, IV)
        cipherText += IV
    
    return cipherText
Example #15
    def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
        """
        This method runs a simple version of the perceptron algorithm on the data given.
        :param data: numpy array of each data point to be used for training. This array should already be padded with
        a 1's column in order to ensure a bias weight is included.
        :param labels: numpy array specify the labels {-1, 1}
        :return: None
        """
        # Pad the data with an all ones vector.
        p_data = pad(data)

        # Initialize the weights and average weights.
        self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])
        self.average_weights = self.weights

        for epoch in range(epochs):

            # Go through each data point.
            for x, y in zip(*shuffle(p_data, labels)):

                # If (w^t*x + b)*y < 0 make an update.
                if np.dot(self.weights, x) * y < 0:
                    # Update the weights
                    self.weights = self.weights + self.learning_rate * y * x

                    # Record update count.
                    self.update_count += 1

                # Increment the average weights even if no misprediction happens.
                self.average_weights = self.average_weights + self.weights

            # record epoch specific information if specified
            if record_epochs:
                val_x, val_y = validation_set[0], validation_set[1]
                # Set current weights to the averaged weights so predict will use them.
                temp_weights = self.weights
                self.weights = self.average_weights / (len(data) * (epoch + 1))
                self.epoch_records[epoch + 1] = {'accuracy': accuracy(self.predict(val_x), val_y),
                                                 'weights': self.weights}
                # Set them back to resume normal algorithm operation.
                self.weights = temp_weights

        # Divide by the total number of examples it has seen.
        self.average_weights = self.average_weights / (len(data) * epochs)
        # Finally set the final weights to the average weights so they will be used for predictions.
        self.weights = self.average_weights
Example #16
def encrypt_cbc(msg, key, iv='\x00' * 16, is_base64=False):
    cipher = AES.new(key, AES.MODE_ECB)

    if is_base64:
        msg = base64.b64decode(msg)

    msg = pad(msg, 16)
    blocks = [msg[i:i + 16] for i in range(0, len(msg), 16)]
    encrypted = ''
    en = iv

    for block in blocks:
        ip = xor(en, block)
        en = cipher.encrypt(ip)
        encrypted += en

    return encrypted
Example #17
def just_encrypt(msg):
    key = random_bytes(16, 16)
    padchars = (random_bytes(5, 10), random_bytes(5, 10))
    msg = padchars[0] + msg + padchars[1]
    msg = pad(msg, 16)
    choice = random.randint(0, 1)

    if choice:
        encrypted = encrypt_ecb(msg, key)
        mode = 'ECB'

    else:
        iv = random_bytes(16, 16)
        encrypted = encrypt_cbc(msg, key, iv)
        mode = 'CBC'

    return (mode, key, encrypted)
Example #18
    def predict(self, data):
        """
        This method takes in
        :param data: numpy array of each data point to make a prediction for. This array should already be padded with
        a 1's column in order to ensure a bias weight is taken into consideration. The number of the columns should
        also correspond with the number of weights.
        :return: numpy array containing predictions made for each data point given in the data array.
        """
        # Pad the data with an all ones vector.
        data = pad(data)

        # For each data point make a prediction and store it.
        preds = np.ndarray((len(data), 1))
        for ix, x in enumerate(data):
            preds[ix] = np.sign(np.dot(self.weights, x))

        return preds
Example #19
    def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
        """
        This method runs a simple version of the perceptron algorithm on the data given.
        :param data: numpy array of each data point to be used for training. This array should already be padded with
        a 1's column in order to ensure a bias weight is included.
        :param labels: numpy array specify the labels {-1, 1}
        :param epochs: number of epochs to run.
        :record_epochs: If set to true will record weights, and accuracy after each epoch.
        :return: None
        """
        # Pad the data with an all ones vector.
        p_data = pad(data)

        # Initialize the weights.
        self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])

        # Decaying Learning Rate
        t = 0

        for epoch in range(epochs):

            # Go through each data point.
            for x, y in zip(*shuffle(p_data, labels)):

                # If (w^t*x + b)*y < 0 make an update.
                if np.dot(self.weights, x) * y < 0:
                    # Calculate the decayed learning rate.
                    decayed_learning_rate = self.learning_rate / (1 + t)

                    # Update the weights
                    self.weights = self.weights + (decayed_learning_rate * y * x)

                    # Record update count.
                    self.update_count += 1

                # Increment t after each example not just mispredictions.
                t += 1

            # record epoch specific information if specified
            if record_epochs:
                val_x, val_y = validation_set[0], validation_set[1]
                self.epoch_records[epoch + 1] = {'accuracy': accuracy(self.predict(val_x), val_y),
                                                 'weights': self.weights}
Example #20
def optimize_op(sampler, model, energy_fn):
    """
    Perform optimization iteration.

    Returns
    -------
    energies : tensor of shape (N,)
        Energy of MCMC samples
    train_op : Optimization tensorflow op
    """
    with tf.device('/cpu:0'):
        samples = tf.stop_gradient(sampler.mcmc_op())
        energies = energy_fn(samples)
        energies = tf.stop_gradient(energies)

    with tf.device('/gpu:0'):
        samples_shaped = tf.reshape(samples, (NUM_SAMPLES,)+SYSTEM_SHAPE)
        samples_padded = pad(samples_shaped, SYSTEM_SHAPE, [(K-1)//2]*N_DIMS)
        loss = loss_op(model.factors(samples_padded), energies)
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        train_op = optimizer.minimize(loss)

    return energies, train_op
Example #21
    def mcmc_reset(self):
        """Reset MCMC variables."""
        uniform_states = tf.random_uniform(
            (self.num_samplers, ) + self.system_shape, 0, 2,
            dtype=tf.int32) * 2 - 1
        uniform_states_padded = pad(uniform_states, self.system_shape,
                                    [(self.r - 1) // 2] * self.n_dims)
        uniform_states_flattened = tf.reshape(uniform_states_padded,
                                              (self.num_samplers, -1))

        states = tf.cond(self.new_samples, lambda: uniform_states_flattened,
                         lambda: self.current_samples_var)

        factors = tf.reshape(
            self.model.factors(
                tf.reshape(states, (self.num_samplers, ) + self.padded_shape)),
            (self.num_samplers, -1))

        return tf.group(
            tf.assign(self.current_samples_var, states),
            tf.assign(self.current_factors_var, factors),
            tf.assign(self.samples_var,
                      tf.zeros_like(self.samples_var, dtype=tf.int32)),
            tf.assign(
                self.flip_positions_var,
                tf.random_uniform(
                    [self.sample_its, self.num_samplers, self.num_flips],
                    0,
                    self.num_spins,
                    dtype=tf.int32)),
            tf.assign(
                self.accept_sample_var,
                tf.random_uniform([self.sample_its, self.num_samplers],
                                  0.,
                                  1.,
                                  dtype=tf.float32)),
        )
Example #22
    def handle_requests(self, data, client_address):
        usr_hash, length, start_from, finish_at = helpers.get_request_data(data)
        user_word, ack = helpers.scan_and_compare(start_from, finish_at, usr_hash)
        if user_word:
            self.udp_socket.sendto(
                TEAM_NAME + ack + usr_hash + bytes([length]) + user_word.encode() + helpers.pad(length).encode(),
                client_address)
        else:
            self.udp_socket.sendto(TEAM_NAME + ack + usr_hash + bytes([0]), client_address)
Example #23
import tensorflow as tf
from tensorflow import keras

from helpers import encode_review, decode_review, pad

import numpy as np

# train and test dataset
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))

# print(train_data[0])
# len(train_data[0]), len(train_data[1])

train_data = pad(train_data)
test_data = pad(test_data)

# len(train_data[0]), len(train_data[1])
# print(train_data[0])

# build model
vocab_size = 10000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()
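The script ends at model.summary() without training; a hedged continuation showing how the padded data would typically be fed to this model (the optimizer, loss, and epoch count are illustrative assumptions, not from the original):

# Assumed training setup; hyperparameters are illustrative.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_data, train_labels, epochs=10, batch_size=512,
          validation_data=(test_data, test_labels))
print(model.evaluate(test_data, test_labels))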
Example #24
        def convert_to_features(example_batch):
            max_length = self.tokenizer.model_max_length

            articles = example_batch[self.hparams.data_example_column]

            articles_encoded_step = []
            for idx, article in enumerate(articles):
                article = article.strip()
                try:
                    article_encoded = self.tokenizer(
                        article,
                        padding="max_length",
                        truncation=True,
                    )
                    articles_encoded_step.append(article_encoded)
                except Exception:  # skipcq: FLK-E722
                    print("Failed to tokenize article: {}".format(article))
                    sys.exit(1)

                if idx != 0:
                    current_length = len(article_encoded["input_ids"])
                    first_length = len(articles_encoded_step[0]["input_ids"])
                    assert (
                        current_length == first_length
                    ), "The length of the current input, {}, does not match the length of the first input, {}.".format(  # noqa: E501
                        current_length, first_length)

            articles_encoded = {
                "input_ids": [i["input_ids"] for i in articles_encoded_step],
                "attention_mask":
                [i["attention_mask"] for i in articles_encoded_step],
            }

            # articles_encoded = self.tokenizer.batch_encode_plus(
            #     articles, pad_to_max_length=True, truncation=True,
            # )

            highlights = example_batch[self.hparams.data_summarized_column]

            # Tokenize highlights using spacy to split them into sentences if they were not
            # already split in the dataset (use `hparams.split_char` to specify the sentence
            # boundary character)
            if not self.hparams.split_char:
                highlights = tokenize(spacy_nlp,
                                      highlights,
                                      disable_progress_bar=True)

            sep_token = self.tokenizer.sep_token
            highlights_input_ids = []
            highlights_attention_masks = []

            # For each ground-truth summary
            for highlight in highlights:
                if self.hparams.split_char:
                    # simply split into sentences if `hparams.split_char` is specified
                    sents = highlight.split(self.hparams.split_char)
                else:
                    # `highlight` is a list of sentences where each sentence is a list of tokens
                    # Combine those tokens to create a list of sentences.
                    sents = [
                        " ".join(list_of_ids) for list_of_ids in highlight
                    ]

                assert type(sents) is list
                assert len(sents) > 0

                # Tokenize each sentence and append the `sep_token`
                sents_tokenized = []
                for sent in sents:
                    assert type(sent) is str
                    assert len(sent) > 0
                    sent = self.tokenizer.tokenize(sent)
                    sent.append(sep_token)
                    sents_tokenized.append(sent)

                # Delete the last `sep_token` from the last sentence
                assert type(sents_tokenized[-1][-1]) is str
                del sents_tokenized[-1][-1]
                # Flatten `sents_tokenized` (a list of sentences where each sentence is a list
                # of tokens) to a list of tokens
                sents_tokenized_flat = list(
                    itertools.chain.from_iterable(sents_tokenized))
                assert type(sents_tokenized_flat[0]) is str
                assert len(sents_tokenized_flat) > 0

                # Convert the tokens to `input_ids`
                # `max_length` is the max length minus 2 because we need to add the
                # beginning and ending tokens to the target
                sents_input_ids = self.tokenizer.encode_plus(
                    sents_tokenized_flat,
                    truncation=True,
                    is_split_into_words=True,
                    add_special_tokens=False,
                    max_length=(max_length - 2),
                    return_attention_mask=False,
                    return_token_type_ids=False,
                )["input_ids"]

                # Insert beginning of sequence token and append end of sequence token.
                sents_input_ids.insert(0, self.target_boseq_token_id)
                sents_input_ids.append(self.target_eoseq_token_id)

                # Create attention mask
                attention_mask = [1] * len(sents_input_ids)

                # Append the `input_ids` and `attention_mask`
                highlights_input_ids.append(sents_input_ids)
                highlights_attention_masks.append(attention_mask)

            # Pad the highlight input ids and attention masks to `tokenizer.max_len`.
            # The articles have already been padded because they do not need the extra
            # `boseq` and `eoseq` tokens.
            highlights_input_ids = pad(
                highlights_input_ids,
                self.tokenizer.pad_token_id,
                width=max_length,
            )
            highlights_attention_masks = pad(highlights_attention_masks,
                                             0,
                                             width=max_length)

            return {
                "source": articles_encoded["input_ids"],
                "target": highlights_input_ids,
                "source_mask": articles_encoded["attention_mask"],
                "target_mask": highlights_attention_masks,
            }
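Examples #24, #26, and #27 call pad with a pad value and an optional width keyword, so this variant presumably right-pads each inner list to a common width, defaulting to the longest list in the batch. A minimal sketch under that assumption (the real helper is not shown):

def pad(data, pad_id, width=None):
    # Assumed helper: right-pad every list in `data` with `pad_id` until
    # it reaches `width` (default: the length of the longest list present).
    if width is None:
        width = max(len(d) for d in data)
    return [d + [pad_id] * (width - len(d)) for d in data]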
Example #25
    def send_offer_message(self, client_address):
        offer_message = TEAM_NAME + OFFER + (helpers.pad(40)).encode() + bytes([0])
        print(offer_message)
        self.udp_socket.sendto(offer_message, client_address)
Example #26
def pad_batch_collate(batch, modifier=None):
    r"""
    Collate function to be passed to ``DataLoaders``. PyTorch Docs:
    `https://pytorch.org/docs/stable/data.html#dataloader-collate-fn <https://pytorch.org/docs/stable/data.html#dataloader-collate-fn>`__

    Calculates padding (per batch for efficiency) of ``labels`` and
    ``token_type_ids`` if they exist within the batch from the ``Dataset``.
    Also, pads ``sent_rep_token_ids`` and creates the ``sent_rep_mask`` to
    indicate which numbers in the ``sent_rep_token_ids`` list are actually
    the locations of sentence representation ids and which are padding.
    Finally, calculates the ``attention_mask`` for each set of ``input_ids``
    and pads both the ``attention_mask`` and the ``input_ids``. Converts all
    inputs to tensors.

    If ``sent_lengths`` are found then they will also automatically be
    padded. However, the padding for sentence lengths is complicated. Each
    list of sentence lengths needs to be the length of the longest list of
    sentence lengths and the sum of all the lengths in each list needs to
    add to the length of the input_ids width (the length of each input_id).
    The second requirement exists because ``torch.split()`` (which is used
    in the ``mean_tokens`` pooling algorithm to convert word vectors to
    sentence embeddings in ``pooling.py``) will split a tensor into the
    lengths requested but will error instead of returning any extra.
    However, ``torch.split()`` will split a tensor into zero length
    segments. Thus, to solve this, zeros are added to each sentence length
    list for each example until one more padding value is needed to get the
    maximum number of sentences. Once only one more value is needed, the
    total value needed to reach the width of the ``input_ids`` is added.

    ``source`` and ``target``, if present, are simply passed on without any
    processing. Therefore, the standard ``collate_fn`` function for
    ``DataLoader``\ s will not work if these are present since they cannot
    be converted to tensors without padding. This ``collate_fn`` must be
    used if ``source`` or ``target`` is present in the loaded dataset.

    The ``modifier`` argument accepts a function that takes the
    ``final_dictionary`` and returns a modified ``final_dictionary``. The
    ``modifier`` function will be called directly before
    ``final_dictionary`` is returned in :meth:`~data.pad_batch_collate`. This allows
    for easy extendability.
    """
    elem = batch[0]
    final_dictionary = {}

    for key in elem:
        # don't process `sent_lengths`
        if key == "sent_lengths":
            continue

        feature_list = [d[key] for d in batch]
        if key == "sent_rep_token_ids":
            feature_list = pad(feature_list, -1)
            sent_rep_token_ids = torch.tensor(feature_list)

            sent_rep_mask = ~(sent_rep_token_ids == -1)
            sent_rep_token_ids[sent_rep_token_ids == -1] = 0

            final_dictionary["sent_rep_token_ids"] = sent_rep_token_ids
            final_dictionary["sent_rep_mask"] = sent_rep_mask
            continue  # go to next key
        if key == "input_ids":
            input_ids = feature_list

            # Attention
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [[1] * len(ids) for ids in input_ids]

            input_ids_width = max([len(ids) for ids in input_ids])
            input_ids = pad(input_ids, 0, width=input_ids_width)
            input_ids = torch.tensor(input_ids)
            attention_mask = pad(attention_mask, 0)
            attention_mask = torch.tensor(attention_mask)

            if "sent_lengths" in elem:
                sent_lengths = []
                sent_lengths_mask = []
                sent_lengths_width = max([len(d["sent_lengths"]) + 1 for d in batch])
                for d in batch:
                    current_sent_lens = d["sent_lengths"]
                    current_sent_lengths_mask = [True] * len(current_sent_lens)
                    num_to_add = sent_lengths_width - len(current_sent_lens)
                    total_value_to_add = input_ids_width - sum(current_sent_lens)
                    while num_to_add > 1:
                        num_to_add -= 1
                        # total_value_to_add -= 1
                        current_sent_lens.append(0)
                        current_sent_lengths_mask.append(False)
                    # if a value needs to be added to make `sum(current_sent_lens)` the total input
                    # sequence length OR there is one more number to add (this can happen if the input
                    # sequence exactly ends with a sentence, making the total of the lengths the length
                    # of the sequence, or if there is one sentence that takes up the entire sequence)
                    if total_value_to_add > 0 or num_to_add == 1:
                        current_sent_lens.append(total_value_to_add)
                        current_sent_lengths_mask.append(False)

                    sent_lengths.append(current_sent_lens)
                    sent_lengths_mask.append(current_sent_lengths_mask)
                final_dictionary["sent_lengths"] = sent_lengths
                final_dictionary["sent_lengths_mask"] = torch.tensor(sent_lengths_mask)

            final_dictionary["input_ids"] = input_ids
            final_dictionary["attention_mask"] = attention_mask

            continue

        if key in ("source", "target"):
            final_dictionary[key] = feature_list
            continue

        if key in ("labels", "token_type_ids"):
            feature_list = pad(feature_list, 0)

        feature_list = torch.tensor(feature_list)
        final_dictionary[key] = feature_list

    if modifier:
        final_dictionary = modifier(final_dictionary)

    return final_dictionary
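A minimal usage sketch for this collate function, assuming the width-padding pad helper sketched after Example #24 and a dataset whose items are dictionaries like those the docstring describes:

from torch.utils.data import DataLoader

# Hypothetical two-example dataset; the keys mirror the docstring.
dataset = [
    {"input_ids": [101, 7592, 102], "labels": [1, 0]},
    {"input_ids": [101, 2088, 2003, 102], "labels": [0, 1, 1]},
]
loader = DataLoader(dataset, batch_size=2, collate_fn=pad_batch_collate)
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["attention_mask"].shape)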
Example #27
    def get_features(
        self,
        tokenizer,
        bert_compatible_cls=True,
        create_sent_rep_token_ids=True,
        sent_rep_token_id=None,
        create_sent_lengths=True,
        create_segment_ids="binary",
        segment_token_id=None,
        create_source=False,
        n_process=2,
        max_length=None,
        pad_on_left=False,
        pad_token=0,
        mask_padding_with_zero=True,
        create_attention_mask=True,
        pad_ids_and_attention=True,
        return_type=None,
        save_to_path=None,
        save_to_name=None,
        save_as_type="txt",
    ):
        r"""Convert the examples stored by the ``SentencesProcessor`` to features that can be used by
        a model. The following processes can be performed: tokenization, token type ids (to separate
        sentences), sentence representation token ids (the locations of each sentence representation
        token), sentence lengths, and the attention mask. Padding can be applied to the tokenized
        examples and the attention masks but it is recommended to instead use the
        :meth:`data.pad_batch_collate` function so each batch is padded individually for efficiency
        (less zeros passed through model).

        Arguments:
            tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to tokenize the examples.
            bert_compatible_cls (bool, optional): Adds '[CLS]' tokens in front of each sentence. This is useful
                so that the '[CLS]' token can be used to obtain sentence
                embeddings. This only works if the chosen model has the '[CLS]'
                token in its vocabulary. Default is True.
            create_sent_rep_token_ids (bool, optional): Option to create sentence representation token ids. This
                will store a list of the indexes of all the ``sent_rep_token_id``\ s
                in the tokenized example. Default is True.
            sent_rep_token_id ([type], optional): The token id that should be captured for each sentence (should have
                one per sentence and each should represent that sentence).
                Default is ``'[CLS]' token if bert_compatible_cls else '[SEP]' token``.
            create_sent_lengths (bool, optional): Option to create a list of sentence lengths where each index in
                the list corresponds to the respective sentence in the example. Default is True.
            create_segment_ids (str, optional): Option to create segment ids (aka token type ids).
                See https://huggingface.co/transformers/glossary.html#token-type-ids for more info.
                Set to either "binary", "sequential", or False.

                * ``binary`` alternates between 0 and 1 for each sentence.
                * ``sequential`` starts at 0 and increments by 1 for each sentence.
                * ``False`` does not create any segment ids.

                Note: Many pretrained models that accept token type ids use them
                for question answering and related tasks where the model receives
                two inputs. Therefore, most models have a token type id vocabulary
                size of 2, which means they only have learned 2 token type ids. The
                "binary" mode exists so that these pretrained models can easily
                be used.
                Default is "binary".
            segment_token_id (str, optional): The token id to be used when creating segment ids. Can be set to 'period'
                to treat periods as sentence separation tokens, but this is a terrible
                idea for obvious reasons. Default is '[SEP]' token id.
            create_source (bool, optional): Option to save the source text (non-tokenized) as a string. Default is False.
            n_process (int, optional): How many processes to use when running get_features_process().
                Set higher to run faster and lower if you experience OOM issues. Default is 2.
            max_length (int, optional): If ``pad_ids_and_attention`` is True then pad to this amount. Default is ``tokenizer.max_len``.
            pad_on_left (bool, optional): Optionally, pad on the left instead of right. Default is False.
            pad_token (int, optional): Which token to use for padding the ``input_ids``. Default is 0.
            mask_padding_with_zero (bool, optional): Use zeros to pad the attention. Uses ones otherwise. Default is True.
            create_attention_mask (bool, optional): Option to create the attention mask. It is recommended to use
                the :meth:`data.pad_batch_collate` function, which will automatically create
                attention masks and pad them on a per batch level. Default is ``False if return_type == "lists" else True``.
            pad_ids_and_attention (bool, optional): Pad the ``input_ids`` with ``pad_token`` and attention masks
                with 0s or 1s depending on ``mask_padding_with_zero``. Pad both to
                ``max_length``. Default is ``False if return_type == "lists" else True``
            return_type (str, optional): Either "tensors", "lists", or None. See "Returns" section below. Default is None.
            save_to_path (str, optional): The folder/directory to save the data to OR None to not save.
                Will save the data specified by ``return_type`` to disk. Default is None.
            save_to_name (str, optional): The name of the file to save. The extension '.pt' is automatically
                appended. Default is ``'dataset_' + self.name + '.pt'``.
            save_as_type (str, optional): The file extension of saved file if `save_to_path` is set. Supports "pt" (PyTorch)
                and "txt" (Text). Saving as "txt" requires the ``return_type`` to be ``lists``. If ``return_type`` is
                ``tensors`` the only ``save_as_type`` available is "pt". Defaults to "txt".

        Returns:
            list or torch.TensorDataset: If ``return_type is None`` return the list of calculated
            features. If ``return_type == "tensors"`` return the features converted to tensors
            and stacked such that features are grouped together into individual tensors. If
            ``return_type == "lists"``, which is the recommended option then exports each
            ``InputFeatures`` object in the exported ``features`` list as a dictionary and appends each
            dictionary to a list. Returns that list.
        """
        assert return_type in ["tensors", "lists"] or return_type is None
        assert save_as_type in ["txt", "pt"] or save_to_path is None
        if save_as_type == "txt":
            assert return_type == "lists"
        if return_type == "tensors":
            assert save_as_type == "pt" or save_to_path is None
        if return_type == "lists":
            create_attention_mask = False
            pad_ids_and_attention = False
        else:  # if `return_type` is None  or "tensors"
            create_attention_mask = True
            pad_ids_and_attention = True

        if max_length is None:
            max_length = tokenizer.model_max_length

        # batch_length = max(len(input_ids) for input_ids in all_input_ids)

        if create_sent_rep_token_ids:
            if sent_rep_token_id == "sep":  # get the sep token id
                sent_rep_token_id = tokenizer.sep_token_id
            elif sent_rep_token_id == "cls":  # get the cls token id
                sent_rep_token_id = tokenizer.cls_token_id
            elif not sent_rep_token_id:  # if the `sent_rep_token_id` is not set
                # if using `bert_compatible_cls` then default to the `cls_token_id`
                if bert_compatible_cls:
                    sent_rep_token_id = tokenizer.cls_token_id
                else:  # otherwise, get the `sep_token_id`
                    sent_rep_token_id = tokenizer.sep_token_id

        if create_segment_ids:
            if segment_token_id == "period":  # get the token id for a "."
                segment_token_id = tokenizer.convert_tokens_to_ids(["."])[0]
            elif (
                not segment_token_id
            ):  # default to trying to get the `sep_token_id` if the `segment_token_id` is not set
                segment_token_id = tokenizer.sep_token_id

        features = []
        pool = Pool(n_process)
        _get_features_process = partial(
            self.get_features_process,
            num_examples=len(self.labels),
            tokenizer=tokenizer,
            bert_compatible_cls=bert_compatible_cls,
            sep_token=tokenizer.sep_token,
            cls_token=tokenizer.cls_token,
            create_sent_rep_token_ids=create_sent_rep_token_ids,
            sent_rep_token_id=sent_rep_token_id,
            create_sent_lengths=create_sent_lengths,
            create_segment_ids=create_segment_ids,
            segment_token_id=segment_token_id,
            create_source=create_source,
            max_length=max_length,
            pad_on_left=pad_on_left,
            pad_token=pad_token,
            mask_padding_with_zero=mask_padding_with_zero,
            create_attention_mask=create_attention_mask,
            pad_ids_and_attention=pad_ids_and_attention,
        )

        for rtn_features in pool.map(
            _get_features_process,
            zip(range(len(self.labels)), self.examples, self.labels),
        ):
            features.append(rtn_features)

        pool.close()
        pool.join()

        if not return_type:
            return features
        elif return_type == "tensors":
            final_tensors = []

            all_input_ids = torch.tensor(
                [f.input_ids for f in features], dtype=torch.long
            )
            final_tensors.append(all_input_ids)
            all_attention_masks = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long
            )
            final_tensors.append(all_attention_masks)
            all_labels = torch.tensor(
                pad([f.labels for f in features], 0), dtype=torch.long
            )
            final_tensors.append(all_labels)

            if create_segment_ids:
                all_token_type_ids = torch.tensor(
                    pad([f.token_type_ids for f in features], 0), dtype=torch.long
                )
                final_tensors.append(all_token_type_ids)
            # Pad sentence representation token ids (`sent_rep_token_ids`)
            if create_sent_rep_token_ids:
                all_sent_rep_token_ids = torch.tensor(
                    pad([f.sent_rep_token_ids for f in features], -1), dtype=torch.long
                )
                all_sent_rep_token_ids_masks = ~(all_sent_rep_token_ids == -1)
                all_sent_rep_token_ids[all_sent_rep_token_ids == -1] = 0
                final_tensors.append(all_sent_rep_token_ids)
                final_tensors.append(all_sent_rep_token_ids_masks)

                if create_sent_lengths:
                    all_sent_lengths = torch.tensor(
                        pad([f.sent_lengths for f in features], 0), dtype=torch.long
                    )
                    final_tensors.append(all_sent_lengths)

            dataset = torch.utils.data.TensorDataset(*final_tensors)

        elif return_type == "lists":
            dataset = [example.to_dict() for example in features]

        if save_to_path:
            final_save_name = save_to_name if save_to_name else ("dataset_" + self.name)
            dataset_path = os.path.join(
                save_to_path, (final_save_name + "." + save_as_type),
            )
            logger.info("Saving dataset into cached file %s", dataset_path)
            if save_as_type == "txt":
                with open(dataset_path, "w+") as file:
                    # Write one JSON object per line so the file can be reloaded easily
                    file.write(
                        "\n".join([json.dumps(x) for x in dataset]) + "\n"
                    )
            elif save_as_type == "pt":
                torch.save(dataset, dataset_path)
            else:
                logger.error("'%s' is an invalid save type.", save_as_type)

        return dataset
Example #28
    def start_activity(self):
        self.udp_socket.sendto(
            TEAM_NAME + DISCOVER + (helpers.pad(40)).encode() + bytes([0]),
            (IP_BROADCAST, SERVER_PORT))  # TODO: Missing fields
        self.wait_for_servers()