Example no. 1
def link(obj_code):
    # build symbol/relocation tables
    symtbl = SymbolTable(False)
    reltbls = []
    for i in range(0, len(obj_code)):
        reltbls += [SymbolTable(True)]
    build_tables(obj_code, symtbl, reltbls)
    # print(symtbl.to_string())
    # Find .text section of input
    byte_off = 0
    line_num = 0
    output = []
    errors = []
    index = 0
    for obj_file in obj_code:
        start, end = find_text_block(obj_file)
        for line in obj_file[start:end]:
            try:
                line_num += 1
                # write instruction out
                instruction = int(line, 16)
                if inst_needs_relocation(instruction):
                    instruction = relocate_inst(instruction, byte_off, symtbl,
                                                reltbls[index])
                write_inst_hex(output, instruction)
            except AssemblerException as e:
                errors += [(line_num, e)]
            byte_off += 4
        index += 1
        byte_off = 0
    if len(errors) > 0:
        print("Errors during linking:")
        for line_num, e in errors:
            print("Error: line {0}: {1}".format(line_num, e))
    return output
Example no. 2
def addScope():
    if not scope_stack:
        print "scope stack empty: global symbol table not initialised"
    else:
        curr_scope = scope_stack[-1]
        new_scope = SymbolTable(curr_scope)
        global scope_label
        scope_label += 1
        new_scope.label = scope_label
        scope_stack.append(new_scope)
        scope_list.append(new_scope)
Example no. 3
 def _preprocess_data(self, sentence_data, init=True):
     # Initialize word table and populate with embeddings
     if init:
         self.word_dict = SymbolTable()
         for word in self.embedding_words:
             self.word_dict.get(word)
     # Process data
     return [
         map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
         for s in sentence_data
     ]
Example no. 4
 def _preprocess_data(self, sentence_data, init=True):
     # Initialize word table and populate with embeddings
     if init:
         self.word_dict = SymbolTable()
     # Process data
     mapper = self.word_dict.get if init else self.word_dict.lookup
     tokens = [
         map_words_to_symbols(s, mapper, self.ngrams) for s in sentence_data
     ]
     if init:
         self.td = self.word_dict.num_symbols()
     return tokens
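Most of the word-table examples above and below (nos. 3, 4, 6-8, 11-19) rely on the same two-method contract: get(word) extends the table, assigning the next free index to unseen words, while lookup(word) is read-only and maps unseen words to a reserved unknown index. The snippet below is an assumed minimal stand-in for that kind of SymbolTable, not the class these projects actually import; the real tables also expose the helpers used elsewhere in these examples (num_words, num_symbols, len, reverse), and the assembler/compiler examples (nos. 1, 2, 5, 10, 20) construct entirely different SymbolTable classes.

class SymbolTable(object):
    """Assumed minimal word table: index 0 = padding, 1 = unknown."""

    def __init__(self, starting_symbol=2, unknown_symbol=1):
        self.s = starting_symbol
        self.unknown = unknown_symbol
        self.d = {}

    def get(self, w):
        # Extend the table: unseen words receive the next free index
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w):
        # Read-only: unseen words map to the unknown index
        return self.d.get(w, self.unknown)

    def num_words(self):
        return len(self.d)

    def num_symbols(self):
        return self.s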
Example no. 5
def main():
    """
    The driver for the Jack syntax analyser. Responsible for setting up and
    invoking Initialiser, JackTokeniser, SymbolTable, and CompilationEngine. 
    """
    # Pass Initialiser the cli to generate array of .jack files for compilation
    initialiser = Initialiser(sys.argv[1])

    # Generate dict of input files for translation and output files for writing
    file_names = {}
    for input_file in initialiser.files:
        vm_file = input_file.replace('jack', 'vm')
        file_names[input_file] = vm_file

    # Compile every .jack file and write to output
    for input_file, output_file in file_names.items():
        # Tokenise the input
        tokeniser = JackTokeniser(input_file)
        # Create a symbol table for the Jack class
        symbol_table = SymbolTable(tokeniser)
        # Create a vm_writer
        vm_writer = VMWriter(output_file)
        # Prepare the compilation engine, compile the class, and close
        engine = CompilationEngine(tokeniser, symbol_table, vm_writer)
        engine.compile_class()
        vm_writer.close()
Example no. 6
class LinearModel(SHALOModelVectorMean, SHALOModelFixed):
    """Linear model over pretrained embeddings"""

    name = 'LinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embeddings
        if init:
            self.word_dict = SymbolTable()
            for word in self.embedding_words:
                self.word_dict.get(word)
        # Process data
        return [
            map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
            for s in sentence_data
        ]
Example no. 7
class SHALOModelPreTrain(SHALOModel):

    name = 'SHALOModelPreTrain'

    def __init__(self, embedding_file, save_file=None, n_threads=None):
        SHALOModel.__init__(self, save_file, n_threads)
        with open(embedding_file, 'rb') as f:
            self.embedding_words, self.embeddings = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        """Get training words and init word table with pre-embedded words"""
        self._get_training_words(training_sentences)
        self.word_dict = SymbolTable()
        for word in self.embedding_words_train:
            self.word_dict.get(word)

    def _get_training_words(self, training_sentences):
        """Get training words and subset of pre-embedded words in train set"""
        unique_words = set(w for s in training_sentences for w in s)
        embedding_idxs_train, self.embedding_words_train = [], []
        for i, word in enumerate(self.embedding_words):
            if word in unique_words:
                self.embedding_words_train.append(word)
                embedding_idxs_train.append(i)
        idxs = np.ravel(embedding_idxs_train)
        self.embeddings_train = self.embeddings[idxs, :]

    def _get_embedding(self):
        """
        Return embedding tensor (either constant or variable)
        Row 0 is 0 vector for no token
        Row 1 is random initialization for UNKNOWN
        Rows 2 : 2 + len(self.embedding_words_train) are pretrained initialization
        Remaining rows are random initialization
        """
        zero = tf.constant(0.0, dtype=tf.float32, shape=(1, self.d))
        s = self.seed - 1
        unk = tf.Variable(tf.random_normal((1, self.d), stddev=SD, seed=s))
        pretrain = tf.Variable(self.embeddings_train, dtype=tf.float32)
        vecs = [zero, unk, pretrain]
        n_r = self.word_dict.num_words() - len(self.embedding_words_train)
        if n_r > 0:
            r = tf.Variable(tf.random_normal((n_r, self.d), stddev=SD, seed=s))
            vecs.append(r)
        self.U = tf.concat(vecs, axis=0, name='embedding_matrix')
        return self.U
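Spelling out the shape arithmetic behind the _get_embedding docstring (a sketch, assuming _word_table_init populated the word table first so the pretrained words hold the lowest indices):

# Assumed layout of U, with K = len(embedding_words_train) and
# N = word_dict.num_words(), N >= K:
#   row 0                zero vector (padding / no token)
#   row 1                random vector for UNKNOWN
#   rows 2 .. K + 1      pretrained rows taken from embeddings_train
#   rows K + 2 .. N + 1  random rows for the remaining indexed words
# Total rows: 1 + 1 + K + (N - K) = N + 2, one per index the word table can emit.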
Example no. 8
 def _preprocess_data(self, sentence_data, init=True):
     # Initialize word table
     if init:
         self.word_dict = SymbolTable()
     # Process data
     mapper = self.word_dict.get if init else self.word_dict.lookup
     return [
         map_words_to_symbols(s, mapper, self.ngrams) for s in sentence_data
     ]
Example no. 9
 def __init__(self, save_file=None, name='RNNBase', seed=None, n_threads=4):
     """Base class for bidirectional RNN"""
     # Define metadata
     self.mx_len = None  # Max sentence length
     self.dim = None  # Embedding dimension
     self.n_v = None  # Vocabulary size
     self.lr = None  # Learning rate
     self.attn = None  # Attention window
     self.cell = None  # RNN cell type
     self.word_dict = SymbolTable()  # Symbol table for dictionary
     # Define input layers
     self.sentences = None
     self.sentence_lengths = None
     self.train_marginals = None
     self.keep_prob = None
     self.seed = seed
     # Super constructor
     super(RNNBase, self).__init__(n_threads=n_threads,
                                   save_file=save_file,
                                   name=name)
Example no. 10
def assemble(input_file):
    cleaned = [
        strip_comments(line).strip()
        for line in utils.read_file_to_list(input_file)
    ]
    asm = [line for line in cleaned if line != ""]
    symtbl = SymbolTable(False)
    reltbl = SymbolTable(True)
    # Pass One
    intermediate, errors_one = pass_one(asm, symtbl)
    # Pass Two
    output, errors_two = pass_two(intermediate, symtbl, reltbl)

    if len(errors_one) > 0:
        print("Errors during pass one:")
        for line_num, e in errors_one:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_two) > 0:
        print("Errors during pass two:")
        for line_num, e in errors_two:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_one) > 0 or len(errors_two) > 0:
        print("One or more errors encountered during assembly operation")
    return intermediate, output
Example no. 11
 def _preprocess_data(self, candidates, extend=False):
     """Convert candidate sentences to lookup sequences
     
     :param candidates: candidates to process
     :param extend: extend symbol table for tokens (train), or lookup (test)?
     """
     if not hasattr(self, 'word_dict'):
         self.word_dict = SymbolTable()
     data, ends = [], []
     for candidate in candidates:
         toks = candidate.get_contexts()[0].text.split()
         # Either extend word table or retrieve from it
         f = self.word_dict.get if extend else self.word_dict.lookup
         data.append(np.array(map(f, toks)))
         ends.append(len(toks))
     return data, ends
Example no. 12
 def _preprocess_data(self, candidates, extend=False):
     """Convert candidate sentences to tagged symbol sequences
         @candidates: candidates to process
         @extend: extend symbol table for tokens (train), or lookup (test)?
     """
     if not hasattr(self, 'word_dict'):
         self.word_dict = SymbolTable()
     data, ends = [], []
     for candidate in candidates:
         # Read sentence data
         tokens = candidate_to_tokens(candidate)
         # Get label sequence
         labels = np.zeros(len(tokens), dtype=int)
         labels[candidate[0].get_word_start():candidate[0].get_word_end() + 1] = 1
         # Tag sequence
         s = tag(tokens, labels)
         # Either extend word table or retrieve from it
         f = self.word_dict.get if extend else self.word_dict.lookup
         data.append(np.array(map(f, s)))
         ends.append(candidate[0].get_word_end())
     return data, ends
Example no. 13
 def _preprocess_data(self, candidates, extend=False):
     """Convert candidate sentences to lookup sequences
     
     :param candidates: candidates to process
     :param extend: extend symbol table for tokens (train), or lookup (test)?
     """
     if not hasattr(self, 'word_dict'):
         self.word_dict = SymbolTable()
     data, ends = [], []
     for candidate in candidates:
         # Mark sentence
         args = [
             (candidate[0].get_word_start(), candidate[0].get_word_end(), 1),
             (candidate[1].get_word_start(), candidate[1].get_word_end(), 2)
         ]
         s = mark_sentence(candidate_to_tokens(candidate), args)
         # Either extend word table or retrieve from it
         f = self.word_dict.get if extend else self.word_dict.lookup
         data.append(np.array(map(f, s)))
         ends.append(max(candidate[i].get_word_end() for i in [0, 1]))
     return data, ends
Example no. 14
class SparseLinearModel(SHALOModelRandInit):
    """Sparse linear model over BOW indicator vector"""

    name = 'SparseLinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embeddings
        if init:
            self.word_dict = SymbolTable()
        # Process data
        mapper = self.word_dict.get if init else self.word_dict.lookup
        tokens = [
            map_words_to_symbols(s, mapper, self.ngrams) for s in sentence_data
        ]
        if init:
            self.td = self.word_dict.num_symbols()
        return tokens

    def _get_data_batch(self, x_batch):
        # Construct LIL matrix
        X_lil = sparse.lil_matrix((len(x_batch), self.td))
        for j, x in enumerate(x_batch):
            for t in x:
                X_lil[j, t] += 1
        # Get batch data
        indices, ids, weights = [], [], []
        max_len = 0
        for i, (row, data) in enumerate(zip(X_lil.rows, X_lil.data)):
            # Dummy weight for all-zero row
            if len(row) == 0:
                indices.append((i, 0))
                ids.append(0)
                weights.append(0.0)
                continue
            # Update indices by position
            max_len = max(max_len, len(row))
            indices.extend((i, t) for t in xrange(len(row)))
            ids.extend(row)
            weights.extend(data)
        shape = (len(X_lil.rows), max_len)
        return [indices, shape, ids, weights], None

    def _get_feed(self, x_batch, len_batch, y_batch=None):
        indices, shape, ids, weights = x_batch
        feed = {
            self.indices: indices,
            self.shape: shape,
            self.ids: ids,
            self.weights: weights,
        }
        if y_batch is not None:
            feed[self.y] = y_batch
        return feed

    def _build(self):
        assert (self.lr is not None)
        assert (self.l2_penalty is not None)
        assert (self.loss_function is not None)
        # Define input placeholders
        self.indices = tf.placeholder(tf.int64)
        self.shape = tf.placeholder(tf.int64, (2, ))
        self.ids = tf.placeholder(tf.int64)
        self.weights = tf.placeholder(tf.float32)
        self.y = tf.placeholder(tf.float32, (None, ))
        # Define training variables
        sparse_ids = tf.SparseTensor(self.indices, self.ids, self.shape)
        sparse_vals = tf.SparseTensor(self.indices, self.weights, self.shape)
        s1, s2 = self.seed, (self.seed + 1 if self.seed is not None else None)
        w = tf.Variable(tf.random_normal((self.td, 1), stddev=0.01, seed=s1))
        b = tf.Variable(tf.random_normal((1, 1), stddev=0.01, seed=s2))
        z = tf.nn.embedding_lookup_sparse(params=w,
                                          sp_ids=sparse_ids,
                                          sp_weights=sparse_vals,
                                          combiner='sum')
        h = tf.squeeze(tf.add(z, b))
        # Define training procedure
        self.loss = self._get_loss(h, self.y)
        self.loss += self.l2_penalty * tf.nn.l2_loss(w)
        self.prediction = tf.sigmoid(h)
        self.train_fn = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        self.save_dict = self._get_save_dict(w=w, b=b)
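To make the sparse batch format concrete, here is a hand-traced toy call of _get_data_batch above; the token ids and the value of self.td are hypothetical.

# Suppose self.td = 5 and x_batch = [[1, 2, 2], [3]] (two token-id sequences).
# The LIL matrix counts ids per row: row 0 -> {1: 1, 2: 2}, row 1 -> {3: 1},
# so the method returns ([indices, shape, ids, weights], None) with
#   indices = [(0, 0), (0, 1), (1, 0)]  # (example, position within its row)
#   shape   = (2, 2)                    # n_examples x longest row
#   ids     = [1, 2, 3]                 # word indices for embedding_lookup_sparse
#   weights = [1.0, 2.0, 1.0]           # per-id counts used as sp_weights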
Example no. 15
class RNNBase(TFNoiseAwareModel):

    representation = True

    def __init__(self, save_file=None, name='RNNBase', seed=None, n_threads=4):
        """Base class for bidirectional RNN"""
        # Define metadata
        self.mx_len = None  # Max sentence length
        self.dim = None  # Embedding dimension
        self.n_v = None  # Vocabulary size
        self.lr = None  # Learning rate
        self.attn = None  # Attention window
        self.cell = None  # RNN cell type
        self.word_dict = SymbolTable()  # Symbol table for dictionary
        # Define input layers
        self.sentences = None
        self.sentence_lengths = None
        self.train_marginals = None
        self.keep_prob = None
        self.seed = seed
        # Super constructor
        super(RNNBase, self).__init__(n_threads=n_threads,
                                      save_file=save_file,
                                      name=name)

    def _preprocess_data(self, candidates, extend):
        """Build @self.word_dict to encode and process data for extraction
            Return list of encoded sentences and list of last index of arguments
        """
        raise NotImplementedError()

    def _check_max_sentence_length(self, ends):
        """Check that extraction arguments are within @self.mx_len"""
        mx = self.mx_len
        for i, end in enumerate(ends):
            if end >= mx:
                w = "Candidate {0} has argument past max length for model:"
                info = "[arg ends at index {0}; max len {1}]".format(end, mx)
                warnings.warn('\t'.join([w.format(i), info]))

    def _make_tensor(self, x):
        """Construct input tensor with padding
            Builds a matrix of symbols corresponding to @self.word_dict for the
            current batch and an array of true sentence lengths
        """
        batch_size = len(x)
        x_batch = np.zeros((batch_size, self.mx_len), dtype=np.int32)
        len_batch = np.zeros(batch_size, dtype=np.int32)
        for j, token_ids in enumerate(x):
            t = min(len(token_ids), self.mx_len)
            x_batch[j, 0:t] = token_ids[0:t]
            len_batch[j] = t
        return x_batch, len_batch

    def _embedding_init(self, s):
        """Random initialization for embedding table"""
        return tf.random_normal((self.n_v - 1, self.dim), stddev=SD, seed=s)

    def _build(self):
        """Get feed forward step, loss function, and optimizer for RNN"""
        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])
        self.train_marginals = tf.placeholder(tf.float32, [None])
        self.keep_prob = tf.placeholder(tf.float32)
        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]
        # Embedding layer
        emb_var = tf.Variable(self._embedding_init(s1))
        embedding = tf.concat([tf.zeros([1, self.dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)
        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        rand_name = "RNN_{0}".format(random.randint(0, 1e12))  # Obscene hack
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(rand_name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = self.cell(self.dim)
            bw_cell = self.cell(self.dim)
            # Add attention if needed
            if self.attn:
                fw_cell = rnn.AttentionCellWrapper(fw_cell,
                                                   self.attn,
                                                   state_is_tuple=True)
                bw_cell = rnn.AttentionCellWrapper(bw_cell,
                                                   self.attn,
                                                   state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                inputs,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
        # Get potentials
        potentials = get_bi_rnn_output(rnn_out, self.dim,
                                       self.sentence_lengths)
        # Compute activation
        potentials_dropout = tf.nn.dropout(potentials, self.keep_prob, seed=s3)
        W = tf.Variable(tf.random_normal((2 * self.dim, 1), stddev=SD,
                                         seed=s4))
        b = tf.Variable(0., dtype=tf.float32)
        h_dropout = tf.squeeze(tf.matmul(potentials_dropout, W)) + b
        # Noise-aware loss
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.train_marginals, logits=h_dropout))
        # Backprop trainer
        self.train_fn = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        # Get prediction
        self.prediction = tf.nn.sigmoid(h_dropout)

    def train(self,
              candidates,
              marginals,
              n_epochs=25,
              lr=0.01,
              dropout=0.5,
              dim=50,
              attn_window=None,
              cell_type=rnn.BasicLSTMCell,
              batch_size=256,
              max_sentence_length=None,
              rebalance=False,
              dev_candidates=None,
              dev_labels=None,
              print_freq=5):
        """Train bidirectional RNN model for binary classification
            @candidates: list of Candidate objects for training
            @marginals: array of marginal probabilities for each Candidate
            @n_epochs: number of training epochs
            @lr: learning rate
            @dropout: keep probability for dropout layer (no dropout if None)
            @dim: embedding dimension
            @attn_window: attention window length (no attention if 0 or None)
            @cell_type: subclass of tensorflow.python.ops.rnn_cell_impl._RNNCell
            @batch_size: batch size for mini-batch SGD
            @max_sentence_length: maximum sentence length for candidates
            @rebalance: bool or fraction of positive examples for training
                        - if True, defaults to standard 0.5 class balance
                        - if False, no class balancing
            @dev_candidates: list of Candidate objects for evaluation
            @dev_labels: array of labels for each dev Candidate
            @print_freq: number of epochs after which to print status
        """
        verbose = print_freq > 0
        if verbose:
            print("[{0}] Dimension={1}  LR={2}".format(self.name, dim, lr))
            print("[{0}] Begin preprocessing".format(self.name))
            st = time()
        # Text preprocessing
        train_data, ends = self._preprocess_data(candidates, extend=True)
        # Get training indices
        np.random.seed(self.seed)
        train_idxs = LabelBalancer(marginals).get_train_idxs(rebalance)
        x_train = [train_data[j] for j in train_idxs]
        y_train = np.ravel(marginals)[train_idxs]
        # Get max sentence size
        self.mx_len = max_sentence_length or max(len(x) for x in x_train)
        self._check_max_sentence_length(ends)
        # Build model
        self.dim = dim
        self.lr = lr
        self.n_v = self.word_dict.len()
        self.attn = attn_window
        self.cell = cell_type
        self._build()
        # Get dev data
        dev_data, dev_gold = None, None
        if dev_candidates is not None and dev_labels is not None:
            dev_data, _ = self._preprocess_data(dev_candidates, extend=False)
            dev_gold = np.ravel(dev_labels)
            if not ((dev_gold >= 0).all() and (dev_gold <= 1).all()):
                raise Exception("Dev labels should be in [0, 1]")
            print("[{0}] Loaded {1} candidates for evaluation".format(
                self.name, len(dev_data)))
        # Run mini-batch SGD
        n = len(x_train)
        batch_size = min(batch_size, n)
        if verbose:
            print("[{0}] Preprocessing done ({1:.2f}s)".format(
                self.name,
                time() - st))
            st = time()
            print("[{0}] Training model".format(self.name))
            print("[{0}] #examples={1}  #epochs={2}  batch size={3}".format(
                self.name, n, n_epochs, batch_size))
        self.session.run(tf.global_variables_initializer())
        for t in range(n_epochs):
            epoch_loss = []
            for i in range(0, n, batch_size):
                # Get batch tensors
                x_b, len_b = self._make_tensor(x_train[i:i + batch_size])
                y_b = y_train[i:i + batch_size]
                # Run training step and evaluate loss function
                epoch_loss.append(
                    self.session.run(
                        [self.loss, self.train_fn], {
                            self.sentences: x_b,
                            self.sentence_lengths: len_b,
                            self.train_marginals: y_b,
                            self.keep_prob: dropout or 1.0,
                        })[0])
            # Print training stats
            if verbose and (t % print_freq == 0 or t in [0, (n_epochs - 1)]):
                msg = "[{0}] Epoch {1} ({2:.2f}s)\tAverage loss={3:.6f}".format(
                    self.name, t,
                    time() - st, np.mean(epoch_loss))
                if dev_data is not None:
                    dev_p = self._marginals_preprocessed(dev_data)
                    f1, _, _ = f1_score(dev_p, dev_gold)
                    msg += '\tDev F1={0:.2f}'.format(100. * f1)
                print(msg)
        if verbose:
            print("[{0}] Training done ({1:.2f}s)".format(
                self.name,
                time() - st))

    def _marginals_preprocessed(self, test_data):
        """Get marginals from preprocessed data"""
        x, x_len = self._make_tensor(test_data)
        return np.ravel(
            self.session.run(
                self.prediction, {
                    self.sentences: x,
                    self.sentence_lengths: x_len,
                    self.keep_prob: 1.0,
                }))

    def marginals(self, test_candidates):
        """Get likelihood of tagged sequences represented by test_candidates
            @test_candidates: list of lists representing test sentence
        """
        test_data, ends = self._preprocess_data(test_candidates, extend=False)
        self._check_max_sentence_length(ends)
        return self._marginals_preprocessed(test_data)
Example no. 16
class TTBB(SHALOModelFixed):
    """Implementation of A Simple but Tough-to-Beat Baseline for Sent. Embedding
    In the basic model, the common component vector is computed before all
    computations. The embeddings are static, so no updates are made.
    """

    name = 'TTBB'

    def __init__(self,
                 embedding_file,
                 word_freq_file,
                 save_file=None,
                 n_threads=None):
        SHALOModelFixed.__init__(self, embedding_file, save_file, n_threads)
        # Get marginals file
        with open(word_freq_file, 'rb') as f:
            self.word_freq = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)

    def _get_mapper(self, init):
        return self.word_dict.lookup

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embeddings
        if init:
            self._word_table_init(sentence_data)
        # Process data
        # Map tokens and return if not initializing
        mapper = self._get_mapper(init)
        tokens = [
            np.ravel(map_words_to_symbols(s, mapper, self.ngrams))
            for s in sentence_data
        ]
        self.train_tokens = tokens
        if not init:
            return tokens
        # If initializing, get marginal estimates
        self.marginals = np.zeros(self.word_dict.num_symbols())
        for word, idx in self.word_dict.d.iteritems():
            # Try getting word frequency directly
            if word in self.word_freq:
                self.marginals[idx] = self.word_freq[word]
            # Otherwise, try getting minimum frequency among sub-grams
            split_grams = word.split(GRAMSEP)
            if len(split_grams) > 1:
                min_freq = min(self.word_freq.get(w, 0.0) for w in split_grams)
                self.marginals[idx] = min_freq
        # Get initial smoother value
        self.a = self.train_kwargs.get('a', -3.0)
        return tokens

    def _compute_train_common_component(self, init=False):
        if init:
            self.session.run(tf.global_variables_initializer())
        x_array, x_len = self._get_data_batch(self.train_tokens)
        self.ccx = self.session.run(self.tf_ccx, {
            self.input: x_array,
            self.input_lengths: x_len
        })
        return self.ccx

    def _get_a_exp(self):
        return tf.constant(self.a, dtype=tf.float32)

    def _get_common_component(self):
        self.ccx = self._compute_train_common_component(init=True)
        return tf.constant(self.ccx, dtype=tf.float32)

    def _embed_sentences(self):
        """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
        # Get word features
        word_embeddings = self._get_embedding()
        word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
        # Get marginal estimates and scaling term
        batch_size = tf.shape(word_feats)[0]
        a = tf.pow(10.0, self._get_a_exp())
        p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
        q = tf.reshape(a / (a + tf.nn.embedding_lookup(p, self.input)),
                       (batch_size, self.mx_len, 1))
        # Compute initial sentence embedding
        z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
        S = z * tf.reduce_sum(q * word_feats, axis=1)
        # Compute common component
        S_centered = S - tf.reduce_mean(S, axis=0)
        _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
        self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
        # Common component removal
        ccx = tf.reshape(self._get_common_component(), (1, self.d))
        sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
        return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
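For reference, the computation _embed_sentences encodes can be written directly in numpy. This is a hedged sketch of the idea (frequency-based reweighting followed by removal of the common component), not the model's TensorFlow graph: the function name and arguments are invented, and the common component is estimated from the same batch here rather than cached from the training tokens as the class above does.

import numpy as np


def ttbb_embed(sentences, emb, p, a=1e-3):
    """Sketch: sentences are lists of word indices, emb is a (V, d) embedding
    matrix, p maps word index to marginal frequency, a is the smoothing term."""
    # Weighted average: frequent words are down-weighted by a / (a + p(w))
    S = np.vstack([
        np.mean([(a / (a + p[w])) * emb[w] for w in s], axis=0)
        for s in sentences
    ])
    # Common component removal: project out the first right singular vector
    _, _, Vt = np.linalg.svd(S - S.mean(axis=0), full_matrices=False)
    ccx = Vt[0]
    return S - np.outer(S.dot(ccx), ccx)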
Example no. 17
 def _word_table_init(self, training_sentences):
     self.word_dict = SymbolTable()
     for word in self.embedding_words:
         self.word_dict.get(word)
Example no. 18
 def _word_table_init(self, training_sentences):
     """Get training words and init word table with pre-embedded words"""
     self._get_training_words(training_sentences)
     self.word_dict = SymbolTable()
     for word in self.embedding_words_train:
         self.word_dict.get(word)
Example no. 19
class CRFTextRNN(RNNBase):
    """RNN for sequence labeling of strings of text."""
    def _preprocess_data(self,
                         candidates,
                         marginals=None,
                         dev_labels=None,
                         extend=False,
                         shuffle_data=False):
        """Convert candidate sentences to lookup sequences

        :param candidates: candidates to process
        :param extend: extend symbol table for tokens (train),
            or lookup (test)?
        """
        if not hasattr(self, 'word_dict'):
            self.word_dict = SymbolTable()

        if not hasattr(self, 'char_dict'):
            self.char_dict = SymbolTable()

        max_word_len = 0
        data, ends, sent_buf, words, word_buf, sents = [], [], [], [], [], []
        for candidate in candidates:
            tok = candidate.get_contexts()[1].text
            index = candidate.get_contexts()[2].text

            if sent_buf and index == '0':
                f = self.word_dict.get if extend else self.word_dict.lookup
                data.append(np.array(map(f, sent_buf)))
                sents.append(sent_buf)
                ends.append(len(sent_buf))
                sent_buf = []

                c = self.char_dict.get if extend else self.char_dict.lookup
                sent_words = [np.array(map(c, chars)) for chars in word_buf]
                words.append(np.array(sent_words))
                word_buf = []

            sent_buf.append(tok)
            word_buf.append(list(tok))
            max_word_len = max(max_word_len, len(tok))

        marg = []
        if marginals is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                marg.append(marginals[cand_idx:end_idx, :])
                cand_idx = end_idx
            marg = np.array(marg)

        aligned_dev_labels = []
        if dev_labels is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                aligned_dev_labels.append(dev_labels[cand_idx:end_idx])
                cand_idx = end_idx
            aligned_dev_labels = np.array(aligned_dev_labels)

        if shuffle_data:
            indexes = np.arange(len(data))
            np.random.shuffle(indexes)
            data = np.array(data)[indexes]
            sents = np.array(sents)[indexes]
            ends = np.array(ends)[indexes]
            if marginals is not None:
                marg = marg[indexes]
            if dev_labels is not None:
                aligned_dev_labels = aligned_dev_labels[indexes]
            if words:
                words = np.array(words)[indexes]
            print('Shuffled data for LSTM')

        words = words if len(words) > 0 else None
        return data, ends, marg, aligned_dev_labels, words, max_word_len, sents

    def _build_model(self,
                     dim=50,
                     dim_char=50,
                     attn_window=None,
                     max_len=20,
                     cell_type=tf.contrib.rnn.BasicLSTMCell,
                     max_word_len=10,
                     word_dict=SymbolTable(),
                     char_dict=SymbolTable(),
                     **kwargs):

        # Set the word dictionary passed in as the word_dict for the instance
        self.max_len = max_len
        self.word_dict = word_dict
        vocab_size = word_dict.len()

        self.max_word_len = max_word_len
        self.char_dict = char_dict
        n_chars = char_dict.len()

        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])

        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

        # Embedding layer
        emb_var = tf.Variable(
            tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
        embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)

        # Character embedding
        # shape = (batch_size, max_sent_len, max_word_len)
        self.words = tf.placeholder(tf.int32, [None, None, None])
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None])

        char_var = tf.get_variable(name='char_embeddings',
                                   dtype=tf.float32,
                                   shape=[n_chars, dim_char])
        char_embedding = tf.nn.embedding_lookup(char_var, self.words)

        char_s = tf.shape(char_embedding)
        # shape = (batch x sentence, word, dim of char embeddings)
        char_embedding = tf.reshape(
            char_embedding,
            shape=[char_s[0] * char_s[1], char_s[-2], dim_char])
        word_lengths = tf.reshape(self.word_lengths,
                                  shape=[char_s[0] * char_s[1]])

        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name + '_char',
                               reuse=False,
                               initializer=init):
            char_fw_cell = cell_type(dim_char, state_is_tuple=True)
            char_bw_cell = cell_type(dim_char, state_is_tuple=True)

            _, ((_, char_fw_out),
                (_, char_bw_out)) = tf.nn.bidirectional_dynamic_rnn(
                    char_fw_cell,
                    char_bw_cell,
                    char_embedding,
                    sequence_length=word_lengths,
                    dtype=tf.float32)
        char_out = tf.concat([char_fw_out, char_bw_out], axis=-1)
        char_rep = tf.reshape(char_out, shape=[-1, char_s[1], 2 * dim_char])
        # inputs = tf.concat([inputs, char_rep], axis=-1)

        # Add dropout layer
        # self.keep_prob = tf.placeholder(tf.float32)
        # inputs_dropout = tf.nn.dropout(inputs, self.keep_prob, seed=s3)
        self.in_keep_prob = tf.placeholder(tf.float32)
        inputs_dropout = tf.nn.dropout(inputs, self.in_keep_prob, seed=s3)

        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = cell_type(dim)
            bw_cell = cell_type(dim)
            # Add attention if needed
            if attn_window:
                fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    fw_cell, attn_window, state_is_tuple=True)
                bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    bw_cell, attn_window, state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            # rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
            #     fw_cell, bw_cell, inputs,
            #     sequence_length=self.sentence_lengths,
            #     initial_state_fw=initial_state_fw,
            #     initial_state_bw=initial_state_bw,
            #     time_major=False
            # )
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                inputs_dropout,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
        potentials, ntime_steps = get_bi_rnn_seq_output(
            rnn_out, dim, self.sentence_lengths)

        # Add dropout layer
        # potentials_dropout = tf.nn.dropout(potentials, self.keep_prob, seed=s3)
        self.out_keep_prob = tf.placeholder(tf.float32)
        potentials_dropout = tf.nn.dropout(potentials,
                                           self.out_keep_prob,
                                           seed=s3)

        # Build activation layer
        self.Y = tf.placeholder(tf.float32, [None, None, self.cardinality])
        self.train_labels = tf.placeholder(tf.int32, [None, self.max_len])

        W = tf.Variable(
            tf.random_normal((2 * dim, self.cardinality), stddev=SD, seed=s4))
        b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
        # self.logits = tf.matmul(potentials, W) + b
        self.logits = tf.matmul(potentials_dropout, W) + b
        self.logits = tf.reshape(self.logits,
                                 [-1, ntime_steps, self.cardinality])
        # self.marginals_op = tf.nn.softmax(self.logits)

        self.pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)

    def _build_training_ops(self, **training_kwargs):

        # batch_size = tf.shape(self.logits)[0]
        # seq_len = tf.shape(self.logits)[1]
        # self.Y = tf.cast(tf.argmax(self.Y, axis=2), tf.int32)
        # self.Y = tf.reshape(self.Y, [batch_size, seq_len])
        #
        # log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
        #     self.logits, self.Y, self.sentence_lengths)
        # self.loss = tf.reduce_mean(-log_likelihood)

        # self.pred, viterbi_score = tf.contrib.crf.viterbi_decode(
        #     self.logits, self.transition_params)

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                         labels=self.Y)

        # losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #     logits=self.logits, labels=self.train_labels)

        mask = tf.sequence_mask(self.sentence_lengths)
        losses = tf.boolean_mask(losses, mask)

        self.loss = tf.reduce_mean(losses)

        # Build training op
        self.lr = tf.placeholder(tf.float32)
        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def _construct_feed_dict(self,
                             X_b,
                             Y_b,
                             lr=0.01,
                             dropout=None,
                             train_labels=None,
                             chars=None,
                             dropout_in=None,
                             dropout_out=None,
                             **kwargs):
        X_b, len_b, Y_b, L_b, C_b, len_c = self._make_tensor(
            X_b, Y_b, train_labels, chars)

        return {
            self.sentences: X_b,
            self.sentence_lengths: len_b,
            self.Y: Y_b,
            # self.keep_prob: dropout or 1.0,
            self.in_keep_prob: dropout_in or 1.0,
            self.out_keep_prob: dropout_out or 1.0,
            self.lr: lr,
            self.train_labels: L_b,
            self.words: C_b,
            self.word_lengths: len_c
        }

    def _make_tensor(self, x, y=None, z=None, c=None):
        """Construct input tensor with padding
            Builds a matrix of symbols corresponding to @self.word_dict for the
            current batch and an array of true sentence lengths
        """
        batch_size = len(x)
        x_batch = np.zeros((batch_size, self.max_len), dtype=np.int32)
        y_batch = np.zeros((batch_size, self.max_len, self.cardinality))
        z_batch = np.zeros((batch_size, self.max_len), dtype=np.int32)
        c_batch = np.zeros((batch_size, self.max_len, self.max_word_len),
                           dtype=np.int32)
        len_batch = np.zeros(batch_size, dtype=np.int32)
        len_words = np.zeros((batch_size, self.max_len), dtype=np.int32)

        if c is not None and y is None and z is None:
            for j, (token_ids, words) in enumerate(zip(x, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                len_batch[j] = t

                for w_idx, word_ids in enumerate(words[0:t]):
                    c_batch[j][w_idx][0:len(word_ids)] = word_ids

                char_t = np.array([
                    min(len(word_ids), self.max_word_len) for word_ids in words
                ])
                len_words[j][0:len(char_t)] = char_t

        elif c is not None and y is not None and z is None:
            for j, (token_ids, marginals, words) in enumerate(zip(x, y, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                len_batch[j] = t

                for w_idx, word_ids in enumerate(words[0:t]):
                    c_batch[j][w_idx][0:len(word_ids)] = word_ids

                char_t = np.array([
                    min(len(word_ids), self.max_word_len) for word_ids in words
                ])
                len_words[j][0:len(char_t)] = char_t

        elif c is not None:
            for j, (token_ids, marginals, labels,
                    words) in enumerate(zip(x, y, z, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                z_batch[j, 0:t] = labels[0:t]
                len_batch[j] = t

                for w_idx, word_ids in enumerate(words[0:t]):
                    c_batch[j][w_idx][0:len(word_ids)] = word_ids

                char_t = np.array([
                    min(len(word_ids), self.max_word_len) for word_ids in words
                ])
                len_words[j][0:len(char_t)] = char_t

        elif z is not None:
            for j, (token_ids, marginals, labels) in enumerate(zip(x, y, z)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                z_batch[j, 0:t] = labels[0:t]
                len_batch[j] = t

        elif y is not None:
            for j, (token_ids, marginals) in enumerate(zip(x, y)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                len_batch[j] = t

        else:
            for j, token_ids in enumerate(x):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                len_batch[j] = t

        return x_batch, len_batch, y_batch, z_batch, c_batch, len_words

    def predictions(self, X, b=0.5, batch_size=None, words=None):

        if isinstance(X[0], Candidate):
            X_test, ends, _, _, pwords, _, _ = self._preprocess_data(
                X, extend=False)
            words = pwords if words is not None else None
            self._check_max_sentence_length(ends)
        else:
            X_test = X

        # Make tensor and run prediction op
        x, x_len, _, _, _words, _words_len = self._make_tensor(X_test, c=words)
        pred = self.session.run(
            self.pred,
            {
                self.sentences: x,
                self.sentence_lengths: x_len,
                # self.keep_prob:        1.0,
                self.in_keep_prob: 1.0,
                self.out_keep_prob: 1.0,
                self.words: _words,
                self.word_lengths: _words_len
            })

        # logit_scores = self.session.run(self.logits, {
        #     self.sentences:        x,
        #     self.sentence_lengths: x_len,
        #     self.keep_prob:        1.0,
        #     self.words:            _words,
        #     self.word_lengths:     _words_len
        # })

        # preds = []
        # for logits in logit_scores:
        #     pred_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logits,
        #                                                             self.transition_params)
        #     preds.append(pred_seq)

        return pred
        # return preds

    def score(self,
              X_test,
              Y_test,
              b=0.5,
              set_unlabeled_as_neg=True,
              beta=1,
              batch_size=None,
              other_id=-1,
              out_path='predictions.txt',
              ids_to_classes=None,
              use_chars=False):

        # predictions, viterbi_score = self.predictions(X_test, b, batch_size)
        # pred_words = [self.word_dict.reverse()[i] for i in predictions]
        # try:
        #     Y_test = np.array(Y_test.todense()).reshape(-1)
        # except:
        #     Y_test = np.array(Y_test)

        # correct = np.where([predictions == Y_test])[0].shape[0]
        # return correct / float(Y_test.shape[0])

        X_test, ends, _, _, words, _, sents = self._preprocess_data(
            X_test, extend=False)
        self._check_max_sentence_length(ends)

        words = words if use_chars else None
        predictions = self.predictions(X_test,
                                       b=b,
                                       batch_size=batch_size,
                                       words=words)

        # # Convert Y_test to dense numpy array
        # try:
        #     Y_test = np.array(Y_test.todense()).reshape(-1)
        # except:
        #     Y_test = np.array(Y_test)

        labels = []
        cand_idx = 0
        for sent_len in ends:
            end_idx = cand_idx + sent_len
            labels.append(Y_test[cand_idx:end_idx])
            cand_idx = end_idx
        Y_test = np.array(labels)

        correct = 0
        # correct = np.where([predictions == Y_test])[0].shape[0]
        # return correct / float(Y_test.shape[0])

        token_err, sent_err = 0, 0
        token_num, sent_num = 0, len(Y_test)
        other_total, other_as_class = 0, 0
        class_total, class_as_other = 0, 0

        ids_to_words = self.word_dict.reverse()
        # with open(out_path, 'w') as out:

        preds_final = []

        for sent_pred, sent_gold, sent in zip(predictions, Y_test, sents):
            # for sent_pred, sent_gold, sent in zip(predictions, Y_test, X_test):
            pred_err = 0
            preds_final_sent = []

            for tag_pred, tag_gold, token in zip(sent_pred, sent_gold, sent):
                token_num += 1

                if tag_gold == other_id:
                    other_total += 1
                else:
                    class_total += 1

                if tag_pred == tag_gold:
                    correct += 1

                if tag_pred != tag_gold:
                    pred_err += 1

                    if tag_pred == other_id:
                        class_as_other += 1
                    if tag_gold == other_id:
                        other_as_class += 1

                if tag_pred > self.cardinality:
                    print('PREDICTION ({}) / CARDINALITY MISMATCH ({})'.format(
                        tag_pred, self.cardinality))

                else:
                    # word = ids_to_words.get(token, None)
                    word = token
                    if ids_to_classes is not None:
                        # In Snorkel, class IDs have to start at 1 because 0 is the reserved value for abstaining
                        # labeling functions. There is no abstention in TensorFlow, i.e. classes have to be zero-indexed.
                        class_pred = ids_to_classes.get(tag_pred + 1, None)
                        # class_gold = ids_to_classes.get(tag_gold + 1, None)
                    else:
                        class_pred = tag_pred
                        # class_gold = tag_gold

                    preds_final_sent.append((word, class_pred))

                    # out.write('{}\t{}\t{}'.format(word, class_pred, class_gold))
                    # out.write('\n')

            token_err += pred_err
            if pred_err != 0:
                sent_err += 1

                # out.write('\n')

            preds_final.append(preds_final_sent)

        if other_total == 0:
            other_total = 1
        if class_total == 0:
            class_total = 1

        return float(correct) / token_num, \
            float(token_err) / token_num, float(sent_err) / sent_num, \
            float(other_as_class) / other_total, float(class_as_other) / class_total, \
            preds_final

    def train(self,
              X_train,
              Y_train,
              dev_labels=None,
              X_dev=None,
              max_sentence_length=None,
              shuffle=False,
              max_word_length=None,
              **kwargs):
        """
        Perform preprocessing of data, construct dataset-specific model, then
        train.
        """
        # Text preprocessing
        X_train, ends, Y_train, train_labels, train_words, max_word_len, _ = self._preprocess_data(
            X_train,
            Y_train,
            dev_labels=dev_labels,
            extend=True,
            shuffle_data=shuffle)
        if X_dev is not None:
            X_dev, _, _, _, _, _, _ = self._preprocess_data(X_dev, None,
                                                            extend=False)

        # Get max sentence size
        max_len = max_sentence_length or max(len(x) for x in X_train)
        self._check_max_sentence_length(ends, max_len=max_len)
        max_word_len = max_word_length or max_word_len

        # Train model- note we pass word_dict through here so it gets saved...
        # super(RNNBase, self).train(X_train, Y_train, X_dev=X_dev,
        #                            word_dict=self.word_dict, max_len=max_len, train_labels=train_labels, **kwargs)
        self._train(X_train,
                    Y_train,
                    X_dev=X_dev,
                    words=train_words,
                    char_dict=self.char_dict,
                    word_dict=self.word_dict,
                    max_len=max_len,
                    dev_labels=train_labels,
                    max_word_len=max_word_len,
                    **kwargs)

    def _train(self,
               X_train,
               Y_train,
               dev_labels=None,
               words=None,
               n_epochs=25,
               lr=0.01,
               batch_size=256,
               rebalance=False,
               X_dev=None,
               Y_dev=None,
               print_freq=5,
               dev_ckpt=True,
               dev_ckpt_delay=0.75,
               save_dir='checkpoints',
               **kwargs):
        """
        Generic training procedure for TF model

        :param X_train: The training Candidates. If self.representation is True, then
            this is a list of Candidate objects; else is a csr_AnnotationMatrix
            with rows corresponding to training candidates and columns
            corresponding to features.
        :param Y_train: Array of marginal probabilities for each Candidate
        :param n_epochs: Number of training epochs
        :param lr: Learning rate
        :param batch_size: Batch size for SGD
        :param rebalance: Bool or fraction of positive examples for training
                    - if True, defaults to standard 0.5 class balance
                    - if False, no class balancing
        :param X_dev: Candidates for evaluation, same format as X_train
        :param Y_dev: Labels for evaluation, same format as Y_train
        :param print_freq: number of epochs at which to print status, and if present,
            evaluate the dev set (X_dev, Y_dev).
        :param dev_ckpt: If True, save a checkpoint whenever highest score
            on (X_dev, Y_dev) reached. Note: currently only evaluates at
            every @print_freq epochs.
        :param dev_ckpt_delay: Start dev checkpointing after this portion
            of n_epochs.
        :param save_dir: Save dir path for checkpointing.
        :param kwargs: All hyperparameters that change how the graph is built
            must be passed through here to be saved and reloaded to save /
            reload model. *NOTE: If a parameter needed to build the
            network and/or is needed at test time is not included here, the
            model will not be able to be reloaded!*
        """
        self._check_input(X_train)
        verbose = print_freq > 0

        # Set random seed for all numpy operations
        self.rand_state.seed(self.seed)

        # If the data passed in is a feature matrix (representation=False),
        # set the dimensionality here; else assume this is done by sub-class
        if not self.representation:
            kwargs['d'] = X_train.shape[1]

        if dev_labels is not None:
            if len(dev_labels) > 0:
                train_labels = copy.deepcopy(dev_labels)
            else:
                train_labels = None
        else:
            train_labels = None

        # Create new graph, build network, and start session
        self._build_new_graph_session(**kwargs)

        # Build training ops
        # Note that training_kwargs and model_kwargs are mixed together; ideally
        # would be separated but no negative effect
        with self.graph.as_default():
            self._build_training_ops(**kwargs)

        # Initialize variables
        with self.graph.as_default():
            self.session.run(tf.global_variables_initializer())

        # Run mini-batch SGD
        n = len(X_train) if self.representation else X_train.shape[0]
        batch_size = min(batch_size, n)
        if verbose:
            st = time()
            print("[{0}] Training model".format(self.name))
            print("[{0}] n_train={1}  #epochs={2}  batch size={3}".format(
                self.name, n, n_epochs, batch_size))
        dev_score_opt = 0.0
        for t in range(n_epochs):
            epoch_losses = []
            for i in range(0, n, batch_size):
                if train_labels is not None:
                    batch_labels = train_labels[i:min(n, i + batch_size)]
                else:
                    batch_labels = None

                if words is not None:
                    batch_words = words[i:min(n, i + batch_size)]
                else:
                    batch_words = None

                feed_dict = self._construct_feed_dict(
                    X_train[i:min(n, i + batch_size)],
                    Y_train[i:min(n, i + batch_size)],
                    train_labels=batch_labels,
                    chars=batch_words,
                    lr=lr,
                    **kwargs)
                # Run training step and evaluate loss function
                epoch_loss, _ = self.session.run([self.loss, self.optimizer],
                                                 feed_dict=feed_dict)
                epoch_losses.append(epoch_loss)

            # Reshuffle training data
            train_idxs = range(n)
            self.rand_state.shuffle(train_idxs)
            X_train = [X_train[j] for j in train_idxs] if self.representation \
                else X_train[train_idxs, :]
            Y_train = Y_train[train_idxs]

            if train_labels is not None:
                train_labels = [train_labels[j] for j in train_idxs]

            # Print training stats and optionally checkpoint model
            # if verbose and (t % print_freq == 0 or t in [0, (n_epochs - 1)]):
            #     msg = "[{0}] Epoch {1} ({2:.2f}s)\tAverage loss={3:.6f}".format(
            #         self.name, t, time() - st, np.mean(epoch_losses))
            #     if train_labels is not None:
            #         dev_accurarcy, dev_token_err, dev_sent_err, dev_gold_other_err, dev_pred_other_err \
            #             = self.score(X_dev, train_labels, batch_size=batch_size, preprocess=False, **kwargs)
            #         print(msg)
            #         print('\tDev accuracy: ' + str(dev_accurarcy))
            #         print('\tDev token error rate: ' + str(dev_token_err))
            #         print('\tDev sentence error rate: ' + str(dev_sent_err))
            #         print('\tDev gold-annotated OTHER predicted as some class: ' + str(dev_gold_other_err))
            #         print('\tDev predicted OTHER gold-annotated as some class: ' + str(dev_pred_other_err))
            #     else:
            #         print(msg)

            if verbose and (t % print_freq == 0 or t in [0, (n_epochs - 1)]):
                msg = "[{0}] Epoch {1} ({2:.2f}s)\tAverage loss={3:.6f}".format(
                    self.name, t,
                    time() - st, np.mean(epoch_losses))
                if X_dev is not None:
                    scores = self.score(X_dev, Y_dev, batch_size=batch_size)
                    score = scores if self.cardinality > 2 else scores[-1]
                    score_label = "Acc." if self.cardinality > 2 else "F1"
                    msg += '\tDev {0}={1:.2f}'.format(score_label,
                                                      100. * score)
                print(msg)

                # If best score on dev set so far and dev checkpointing is
                # active, save checkpoint
                if X_dev is not None and dev_ckpt and \
                        t > dev_ckpt_delay * n_epochs and score > dev_score_opt:
                    dev_score_opt = score
                    self.save(save_dir=save_dir, global_step=t)

        # Conclude training
        if verbose:
            print("[{0}] Training done ({1:.2f}s)".format(
                self.name,
                time() - st))

        # If checkpointing on, load last checkpoint (i.e. best on dev set)
        if dev_ckpt and X_dev is not None and verbose and dev_score_opt > 0:
            self.load(save_dir=save_dir)
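A minimal usage sketch of the training loop above (the model class name, data
variables, and hyperparameter values are illustrative assumptions, not part of
the listing). Per the docstring, any keyword argument needed to rebuild the
network, such as dim here, must be passed to train so the model can be
reloaded later.

# Hypothetical usage sketch; SequenceTaggerModel and the X_*/Y_* variables are assumed
model = SequenceTaggerModel(seed=123)
model.train(X_train, Y_train,
            X_dev=X_dev, Y_dev=Y_dev,
            n_epochs=20, batch_size=64, lr=0.01,
            print_freq=5,
            dev_ckpt=True, dev_ckpt_delay=0.5,
            save_dir='checkpoints',
            dim=100)  # needed again at reload time to rebuild the graph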
Esempio n. 20
0
temp_var_count = 0


def newTemp():
    # The head of this helper was truncated in the listing; it is filled in
    # here to mirror newLabel() below (the 'temp' name prefix is an assumption)
    global temp_var_count
    new_temp = 'temp' + str(temp_var_count)
    temp_var_count += 1
    return new_temp
    # need to remove this from variable lists??


label_count = 0


def newLabel():
    global label_count
    new_label = 'label' + str(label_count)
    label_count += 1
    return new_label


global_symbol_table = SymbolTable(None)
scope_stack.append(global_symbol_table)
scope_list.append(global_symbol_table)

precedence = (
    ('right', 'EQUAL', 'NOT'),
    ('left', 'OROR'),
    ('left', 'AMPAMP'),
    ('left', 'EQEQ', 'NOTEQ', 'LESS', 'GREAT', 'LEQ', 'GEQ'),
    ('left', 'PLUS', 'MINUS', 'OR', 'CARET'),
    ('left', 'TIMES', 'DIVIDE', 'MOD', 'LL', 'GG', 'AMPERS', 'AMPCAR'),
)

#-------------------------------Start------------------------------#


def p_start(p):
    '''start : Source'''
    p[0] = p[1]
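To illustrate how the temporary/label helpers above are typically used when a
grammar action emits three-address code, here is a small sketch; the code list
and the emitted instruction strings are assumptions for illustration only.

# Hypothetical illustration; 'code' is an assumed output list of TAC strings
code = []
t = newTemp()                          # e.g. 'temp0' (prefix per newTemp above)
code.append(t + ' = a + b')
end_label = newLabel()                 # e.g. 'label0'
code.append('iffalse ' + t + ' goto ' + end_label)
code.append(end_label + ':')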
Esempio n. 21
0
    def _build_model(self,
                     dim=50,
                     attn_window=None,
                     max_len=20,
                     cell_type=tf.contrib.rnn.BasicLSTMCell,
                     word_dict=SymbolTable(),
                     **kwargs):
        """
        Build RNN model
        
        :param dim: embedding dimension
        :param attn_window: attention window length (no attention if 0 or None)
        :param cell_type: subclass of tensorflow.python.ops.rnn_cell_impl._RNNCell
        :param batch_size: batch size for mini-batch SGD
        :param vocab_size: Vocab size for determining size of word embeddings tensor
        """
        # Set the word dictionary passed in as the word_dict for the instance
        self.max_len = max_len
        self.word_dict = word_dict
        vocab_size = word_dict.len()

        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])

        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

        # Embedding layer
        emb_var = tf.Variable(
            tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
        embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)

        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = cell_type(dim)
            bw_cell = cell_type(dim)
            # Add attention if needed
            if attn_window:
                fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    fw_cell, attn_window, state_is_tuple=True)
                bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    bw_cell, attn_window, state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                inputs,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
        potentials = get_bi_rnn_output(rnn_out, dim, self.sentence_lengths)

        # Add dropout layer
        self.keep_prob = tf.placeholder(tf.float32)
        potentials_dropout = tf.nn.dropout(potentials, self.keep_prob, seed=s3)

        # Build activation layer
        if self.cardinality > 2:
            self.Y = tf.placeholder(tf.float32, [None, self.cardinality])
            W = tf.Variable(
                tf.random_normal((2 * dim, self.cardinality),
                                 stddev=SD,
                                 seed=s4))
            b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
            self.logits = tf.matmul(potentials_dropout, W) + b
            self.marginals_op = tf.nn.softmax(self.logits)
        else:
            self.Y = tf.placeholder(tf.float32, [None])
            W = tf.Variable(tf.random_normal((2 * dim, 1), stddev=SD, seed=s4))

            if self.deterministic:
                # TODO: Implement deterministic mode for categoricals as well;
                # this branch only runs in the binary (cardinality == 2) case.

                # Make deterministic
                # See: https://github.com/tensorflow/tensorflow/pull/10636/files
                b = tf.Variable(np.zeros([1]), dtype=tf.float32)
                f_w = tf.matmul(potentials_dropout, W)
                f_w_temp = tf.concat([f_w, tf.ones_like(f_w)], axis=1)
                b_temp = tf.stack([tf.ones_like(b), b], axis=0)
                self.logits = tf.squeeze(tf.matmul(f_w_temp, b_temp))
            else:
                b = tf.Variable(0., dtype=tf.float32)
                self.logits = tf.squeeze(tf.matmul(potentials_dropout, W)) + b

            self.marginals_op = tf.nn.sigmoid(self.logits)
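A minimal sketch of how a batch could be fed to the placeholders defined in the
model above, assuming the sentences have already been mapped to integer symbols
with the word dictionary (index 0 is the reserved padding row of the embedding
matrix). The pad_batch helper, the model variable, and the batch data are
illustrative assumptions, not part of the listing.

import numpy as np

def pad_batch(seqs):
    # Right-pad variable-length index sequences with 0, the padding row of
    # the embedding matrix, and record the true lengths
    lengths = np.array([len(s) for s in seqs], dtype=np.int32)
    batch = np.zeros((len(seqs), lengths.max()), dtype=np.int32)
    for i, s in enumerate(seqs):
        batch[i, :len(s)] = s
    return batch, lengths

x_pad, x_lens = pad_batch(x_batch)        # x_batch: list of index arrays
feed = {model.sentences: x_pad,
        model.sentence_lengths: x_lens,
        model.keep_prob: 0.5,             # dropout keep probability
        model.Y: y_batch}
loss, _ = model.session.run([model.loss, model.optimizer], feed_dict=feed)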
Esempio n. 22
0
    def _preprocess_data(self,
                         candidates,
                         marginals=None,
                         dev_labels=None,
                         extend=False,
                         shuffle_data=False):
        """Convert candidate sentences to lookup sequences

        :param candidates: candidates to process
        :param extend: extend symbol table for tokens (train),
            or lookup (test)?
        """
        if not hasattr(self, 'word_dict'):
            self.word_dict = SymbolTable()

        if not hasattr(self, 'char_dict'):
            self.char_dict = SymbolTable()

        max_word_len = 0
        data, ends, sent_buf, words, word_buf, sents = [], [], [], [], [], []
        for candidate in candidates:
            tok = candidate.get_contexts()[1].text
            index = candidate.get_contexts()[2].text

            if sent_buf and index == '0':
                f = self.word_dict.get if extend else self.word_dict.lookup
                data.append(np.array(map(f, sent_buf)))
                sents.append(sent_buf)
                ends.append(len(sent_buf))
                sent_buf = []

                c = self.char_dict.get if extend else self.char_dict.lookup
                sent_words = [np.array(map(c, chars)) for chars in word_buf]
                words.append(np.array(sent_words))
                word_buf = []

            sent_buf.append(tok)
            word_buf.append(list(tok))
            max_word_len = max(max_word_len, len(tok))

        # Flush the final sentence buffer; without this the last sentence in
        # the candidate stream would be dropped
        if sent_buf:
            f = self.word_dict.get if extend else self.word_dict.lookup
            data.append(np.array(map(f, sent_buf)))
            sents.append(sent_buf)
            ends.append(len(sent_buf))
            c = self.char_dict.get if extend else self.char_dict.lookup
            words.append(np.array([np.array(map(c, chars)) for chars in word_buf]))

        marg = []
        if marginals is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                marg.append(marginals[cand_idx:end_idx, :])
                cand_idx = end_idx
            marg = np.array(marg)

        aligned_dev_labels = []
        if dev_labels is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                aligned_dev_labels.append(dev_labels[cand_idx:end_idx])
                cand_idx = end_idx
            aligned_dev_labels = np.array(aligned_dev_labels)

        if shuffle_data:
            indexes = np.arange(len(data))
            np.random.shuffle(indexes)
            data = np.array(data)[indexes]
            sents = np.array(sents)[indexes]
            ends = np.array(ends)[indexes]
            if marginals is not None:
                marg = marg[indexes]
            if dev_labels is not None:
                aligned_dev_labels = aligned_dev_labels[indexes]
            if words:
                words = np.array(words)[indexes]
            print('Shuffled data for LSTM')

        words = words if len(words) > 0 else None
        return data, ends, marg, aligned_dev_labels, words, max_word_len, sents
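The words value returned above is ragged (a list of sentences, each a sequence
of character-index arrays), while a character-aware model needs dense tensors
of shape [batch, max_sent_len, max_word_len] plus per-word lengths. A minimal
padding sketch, again assuming index 0 is reserved for padding; the helper name
is an assumption.

import numpy as np

def pad_char_batch(words_batch, max_word_len):
    # words_batch: list of sentences, each a sequence of char-index arrays
    n_sents = len(words_batch)
    max_sent_len = max(len(sent) for sent in words_batch)
    char_ids = np.zeros((n_sents, max_sent_len, max_word_len), dtype=np.int32)
    word_lengths = np.zeros((n_sents, max_sent_len), dtype=np.int32)
    for i, sent in enumerate(words_batch):
        for j, w in enumerate(sent):
            k = min(len(w), max_word_len)
            char_ids[i, j, :k] = w[:k]
            word_lengths[i, j] = k
    return char_ids, word_lengths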
Esempio n. 23
0
    def _build_model(self,
                     dim=50,
                     dim_char=50,
                     attn_window=None,
                     max_len=20,
                     cell_type=tf.contrib.rnn.BasicLSTMCell,
                     max_word_len=10,
                     word_dict=SymbolTable(),
                     char_dict=SymbolTable(),
                     **kwargs):

        # Set the word dictionary passed in as the word_dict for the instance
        self.max_len = max_len
        self.word_dict = word_dict
        vocab_size = word_dict.len()

        self.max_word_len = max_word_len
        self.char_dict = char_dict
        n_chars = char_dict.len()

        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])

        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

        # Embedding layer
        emb_var = tf.Variable(
            tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
        embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)

        # Character embedding
        # shape = (batch_size, max_sent_len, max_word_len)
        self.words = tf.placeholder(tf.int32, [None, None, None])
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None])

        char_var = tf.get_variable(name='char_embeddings',
                                   dtype=tf.float32,
                                   shape=[n_chars, dim_char])
        char_embedding = tf.nn.embedding_lookup(char_var, self.words)

        char_s = tf.shape(char_embedding)
        # shape = (batch x sentence, word, dim of char embeddings)
        char_embedding = tf.reshape(
            char_embedding,
            shape=[char_s[0] * char_s[1], char_s[-2], dim_char])
        word_lengths = tf.reshape(self.word_lengths,
                                  shape=[char_s[0] * char_s[1]])

        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name + '_char',
                               reuse=False,
                               initializer=init):
            char_fw_cell = cell_type(dim_char, state_is_tuple=True)
            char_bw_cell = cell_type(dim_char, state_is_tuple=True)

            _, ((_, char_fw_out),
                (_, char_bw_out)) = tf.nn.bidirectional_dynamic_rnn(
                    char_fw_cell,
                    char_bw_cell,
                    char_embedding,
                    sequence_length=word_lengths,
                    dtype=tf.float32)
        char_out = tf.concat([char_fw_out, char_bw_out], axis=-1)
        char_rep = tf.reshape(char_out, shape=[-1, char_s[1], 2 * dim_char])
        # inputs = tf.concat([inputs, char_rep], axis=-1)

        # Add dropout layer
        # self.keep_prob = tf.placeholder(tf.float32)
        # inputs_dropout = tf.nn.dropout(inputs, self.keep_prob, seed=s3)
        self.in_keep_prob = tf.placeholder(tf.float32)
        inputs_dropout = tf.nn.dropout(inputs, self.in_keep_prob, seed=s3)

        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = cell_type(dim)
            bw_cell = cell_type(dim)
            # Add attention if needed
            if attn_window:
                fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    fw_cell, attn_window, state_is_tuple=True)
                bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    bw_cell, attn_window, state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            # rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
            #     fw_cell, bw_cell, inputs,
            #     sequence_length=self.sentence_lengths,
            #     initial_state_fw=initial_state_fw,
            #     initial_state_bw=initial_state_bw,
            #     time_major=False
            # )
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                inputs_dropout,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
        potentials, ntime_steps = get_bi_rnn_seq_output(
            rnn_out, dim, self.sentence_lengths)

        # Add dropout layer
        # potentials_dropout = tf.nn.dropout(potentials, self.keep_prob, seed=s3)
        self.out_keep_prob = tf.placeholder(tf.float32)
        potentials_dropout = tf.nn.dropout(potentials,
                                           self.out_keep_prob,
                                           seed=s3)

        # Build activation layer
        self.Y = tf.placeholder(tf.float32, [None, None, self.cardinality])
        self.train_labels = tf.placeholder(tf.int32, [None, self.max_len])

        W = tf.Variable(
            tf.random_normal((2 * dim, self.cardinality), stddev=SD, seed=s4))
        b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
        # self.logits = tf.matmul(potentials, W) + b
        self.logits = tf.matmul(potentials_dropout, W) + b
        self.logits = tf.reshape(self.logits,
                                 [-1, ntime_steps, self.cardinality])
        # self.marginals_op = tf.nn.softmax(self.logits)

        self.pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)
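The listing stops right after the prediction op. For completeness, a sketch of
one plausible next step, a sequence cross-entropy loss masked over the padded
time steps, is shown below; this is an assumption about what the training-ops
method might define, not the author's actual loss.

        # Sketch only: masked softmax cross-entropy over padded positions,
        # assuming the padded length equals the longest sentence in the batch
        losses = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.Y, logits=self.logits)
        mask = tf.sequence_mask(self.sentence_lengths, dtype=tf.float32)
        self.loss = tf.reduce_sum(losses * mask) / tf.reduce_sum(mask)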