def __init__(self, embedding, units, softmax, timestep, x_word, y_word):
    self.embedding = check.check_instance(embedding, HiddenState)
    self.units = check.check_dict(units)
    self.softmax = check.check_instance(softmax, LabelDistribution)
    self.timestep = timestep
    self.x_word = x_word
    self.y_word = y_word
def __init__(self, model_dir, versions=None, latest=None, step=-1):
    self.model_dir = check.check_instance(model_dir, str)
    self.save_path = self.get_save_path(self.model_dir)
    # Avoid a shared mutable default argument.
    self.versions = check.check_instance({} if versions is None else versions, dict)
    self.latest = latest
    self.step = check.check_instance(step, int)
    self.next_step = self.step + 1
def train(self, xys_stream, training_parameters):
    check.check_instance(training_parameters, mlbase.TrainingParameters)
    slot_length = len(str(training_parameters.epochs())) - 1
    epoch_template = "[%s] Epoch training {:%dd}: (loss, perplexity): {:.6f}, {:.6f}" % (self.scope, slot_length)
    final_loss = None
    epochs_tenth = max(1, int(training_parameters.epochs() / 10))
    losses = training_parameters.losses()
    finished = False
    epoch = -1

    while not finished:
        epoch += 1
        epoch_loss = 0
        # Start at a different offset for every epoch to help avoid overfitting.
        offset = random.randint(0, training_parameters.batch() - 1)
        batch = []
        first = True
        batch_set = False
        count = 0

        for xy in xys_stream():
            batch += [xy]

            if first and len(batch) == offset:
                first = False
                batch_set = True
            elif len(batch) == training_parameters.batch():
                batch_set = True

            if batch_set:
                count += len(batch)
                feed = self.get_training_feed(batch, training_parameters)
                _, training_loss = self.session.run([self.updates, self.cost], feed_dict=feed)
                epoch_loss += training_loss
                batch_set = False
                batch = []

        if len(batch) > 0:
            count += len(batch)
            feed = self.get_training_feed(batch, training_parameters)
            _, training_loss = self.session.run([self.updates, self.cost], feed_dict=feed)
            epoch_loss += training_loss

        epoch_loss /= count
        epoch_perplexity = math.exp(epoch_loss)
        losses.append(epoch_loss)
        finished, reason = training_parameters.finished(epoch, losses)

        if not finished and epoch % epochs_tenth == 0:
            logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))

    logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))
    logging.debug("Training on %d instances finished due to %s (%s)." % (count, reason, losses))
    return epoch_loss, -epoch_perplexity
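# Illustrative sketch (not part of the model code): the same random-offset
# batching pattern used by train() above, applied to a plain list so the
# slicing behaviour is easy to follow. `run_step` is a hypothetical stand-in
# for the session/feed machinery, and `random` is assumed to be imported at
# module level (as it already is for train()).
def _sketch_offset_batches(items, batch_size, run_step):
    # Start each pass at a random offset so batch boundaries differ per epoch.
    offset = random.randint(0, batch_size - 1)
    batch = []
    first = True
    count = 0

    for item in items:
        batch.append(item)

        if (first and len(batch) == offset) or len(batch) == batch_size:
            first = False
            count += len(batch)
            run_step(batch)
            batch = []

    # Flush whatever is left at the end of the stream.
    if len(batch) > 0:
        count += len(batch)
        run_step(batch)

    return count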
def __init__(self, all_nodes, kind):
    if len(all_nodes) > 0:
        identifier_class = all_nodes[0].identifier.__class__

        for node in all_nodes:
            check.check_instance(node.identifier, identifier_class)
            check.check_equal(node.finalized, True)

    self.all_nodes = all_nodes
    self.kind = check.check_one_of(kind, [Graph.DIRECTED, Graph.UNDIRECTED])
    self.log_len = math.log10(len(self.all_nodes) + 1)
    self.indexes = {}

    for node in self.all_nodes:
        self.indexes[node.identifier] = node

    self.clustering_coefficients = self._calculate_clustering_coefficients()
    self._distances = {}
    self._max_distances = {}
    self._global_max_distance = None
    self._background_calculations = threading.Thread(target=self._submit_calculations)
    self._background_calculations.daemon = True
    self._background_calculations.start()
def __init__(self, sequence, predicted, expected):
    self.sequence = check.check_not_empty(sequence)
    self.predicted = check.check_instance(predicted, SequenceStats)
    self.expected = check.check_instance(expected, SequenceStats)
    check.check_length(self.predicted.values, len(self.sequence))
    check.check_length(self.expected.values, len(self.sequence))
    self.perplexity = perplexity(self.expected.probabilities)
def to_lemma(self, inflection_term):
    self._finalize()
    check.check_instance(inflection_term, Term)

    try:
        return self.inflections[inflection_term]
    except KeyError:
        # Fall back to the lower-cased form of the term.
        return self.inflections[inflection_term.lower()]
def __init__(self, save_dir, versions=None, latest=None, next_step=0):
    self.save_dir = check.check_instance(save_dir, str)
    self.savepoints_file = os.path.join(self.save_dir, Savepoints.SAVEPOINTS_FILE)
    self.model_dir = os.path.join(self.save_dir, Savepoints.MODEL_DIR)
    # Avoid a shared mutable default argument.
    self.versions = check.check_instance({} if versions is None else versions, dict)
    self.latest = latest
    self.next_step = next_step
def sentences(word_token_stream):
    sentence_builder = SentenceBuilder()

    for token in word_token_stream:
        check.check_instance(token, Token)

        # Since we are streaming words in, we can complete the sentence at any time.
        if sentence_builder.process(token, can_complete=True):
            yield sentence_builder.build()
            sentence_builder = SentenceBuilder()

    if not sentence_builder.is_empty():
        yield sentence_builder.build()
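# Hypothetical usage sketch: sentences() is a generator, so it can be driven
# directly by any token stream without materializing the whole corpus first.
#
#   for sentence in sentences(tokenize(open("corpus.txt").read())):
#       handle(sentence)
#
# `tokenize` and `handle` are illustrative stand-ins, not functions defined here.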
def __init__(self, model, save_dir):
    self.model = check.check_instance(model, Model)
    self.savepoints = Savepoints.load(save_dir)

    if self.savepoints is None:
        self.savepoints = Savepoints(save_dir)
        os.makedirs(save_dir, exist_ok=True)
def __init__(self, scope, output_labels, output_distribution):
    super(CustomOutput, self).__init__(scope)
    self.output_labels = check.check_instance(output_labels, mlbase.Labels)
    self.output_distribution = check.check_pdist(output_distribution)
    assert len(self.output_labels) == len(self.output_distribution), "%d != %d" % (len(self.output_labels), len(self.output_distribution))
def __init__(self, labels, array):
    self.labels = check.check_instance(labels, Labels)
    self.array = check.check_length(array, len(self.labels))
    self._prediction = None
    self._prediction_probability = None
    self._distribution = None
    self._ranked_items = None
def __init__(self, values, unknown=None):
    check.check_instance(values, set)
    self.unknown = unknown
    self._empty = None
    self._encoding = {}
    self._decoding = {}

    if unknown is not None:
        self._encoding[unknown] = 0
        self._decoding[0] = self.unknown

    i = len(self._encoding)

    for value in values:
        self._encoding[check.check_not_none(value)] = i
        self._decoding[i] = value
        i += 1
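# Minimal standalone sketch of the same encode/decode construction as above,
# without the check helpers, to make the index assignment explicit. The name
# is hypothetical and the ordering of non-unknown values follows set iteration
# order, exactly as in the constructor.
def _sketch_build_index(values, unknown=None):
    encoding = {}
    decoding = {}

    if unknown is not None:
        # The unknown value, when present, is always pinned to index 0.
        encoding[unknown] = 0
        decoding[0] = unknown

    i = len(encoding)

    for value in values:
        encoding[value] = i
        decoding[i] = value
        i += 1

    return encoding, decoding

# Example: _sketch_build_index({"cat", "dog"}, unknown="<unk>") maps "<unk>"
# to 0 and the remaining values to 1 and 2 (in set iteration order).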
def _dump_stream(data, dir_path, converter):
    check.check_instance(data, queue.Queue)
    batch = []
    batch_size = None
    i = 0
    try_size = 10

    while True:
        item = data.get()

        if item is not None:
            batch += [item]

            # If we're still building out the sample.
            if batch_size is None:
                # Only try to discover the batch_size every so often.
                if len(batch) % try_size == 0:
                    average = _average_size(batch, converter)
                    sample_size = average * len(batch)

                    if sample_size > STREAM_TARGET_FILE_SIZE:
                        batch_size = max(1, int(STREAM_TARGET_FILE_SIZE / average))

                    if len(batch) > 2 * try_size:
                        # Notice we don't need to worry about this growing too large, because the next check upper bounds the batch size.
                        try_size = try_size * 2

                if batch_size is None and len(batch) == STREAM_MAX_BATCH:
                    # The batch is plenty large enough - just set it here.
                    batch_size = STREAM_MAX_BATCH
            else:
                # The batch_size has been determined.
                while len(batch) > batch_size:
                    bytes_out = pickle.dumps(_convert(converter, batch[:batch_size]))
                    _write_bytes(bytes_out, dir_path, i)
                    i += 1
                    batch = batch[batch_size:]
        else:
            # The data stream is complete - flush the remaining data.
            if len(batch) > 0:
                bytes_out = pickle.dumps(_convert(converter, batch))
                _write_bytes(bytes_out, dir_path, i)

            logging.debug("Completed pickling stream for '%s'." % dir_path)
            break
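# Worked example of the batch-size discovery above (the numbers are
# illustrative, not the real STREAM_TARGET_FILE_SIZE): if a probe finds an
# average pickled item size of 2 KiB and the target file size is 1 MiB, then
#   batch_size = max(1, int(1048576 / 2048)) = 512
# so every subsequent file is written with 512 items, and only the final
# flush at the end of the stream may be smaller.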
def extract_terms(corpus, terms_trie, lemmatizer=lambda x: x, inflection_recorder=lambda x, y: 0):
    check.check_instance(terms_trie, Node)
    tags = [tag for word, tag in nltk.pos_tag(corpus)]
    assert len(tags) == len(corpus)
    extracted_terms = set()
    i = 0

    while i < len(corpus):
        span = 1
        node = terms_trie
        lemma = lemmatizer(corpus[i])
        tag = tags[i]
        sequence = None
        matched_term = None

        while lemma in node.children:
            if tag in TAGS and TAGS[tag]:
                node = node.children[lemma]

                if node.final:
                    sequence = corpus[i:i + span]
                    matched_term = node.term

                if i + span >= len(corpus):
                    break

                lemma = lemmatizer(corpus[i + span])
                tag = tags[i + span]
                span += 1
            else:
                break

        if sequence is not None:
            inflection_term = Term(sequence)
            extracted_terms.add(matched_term)
            inflection_recorder(matched_term, inflection_term)
            i += len(sequence)
        else:
            i += 1

    return extracted_terms
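# Worked example of the longest-match scan above (assuming the POS tags of
# these words are enabled in TAGS): with a trie containing the two-word term
# "neural network" and corpus ["a", "neural", "network", "model"], the scan at
# i=1 walks root -> "neural" (not final) -> "network" (final), so it records
# sequence ["neural", "network"] as the matched inflection of that term, calls
# inflection_recorder, and advances i by 2 to continue after the match.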
def record(self, lemma_term, inflection_term, number=1):
    check.check_none(self.lemma_to_inflection)
    logging.debug("record: %s->%s" % (lemma_term, inflection_term))
    check.check_instance(lemma_term, Term)
    check.check_instance(inflection_term, Term)

    if lemma_term not in self.counts:
        self.counts[lemma_term] = {}

    count = self.counts[lemma_term].get(inflection_term, 0)
    self.counts[lemma_term][inflection_term] = count + number

    if inflection_term not in self.inflections:
        self.inflections[inflection_term] = lemma_term
    elif self.inflections[inflection_term] != lemma_term:
        raise ValueError("Inflection '%s' maps to multiple lemmas: [%s, %s]." % (inflection_term, self.inflections[inflection_term], lemma_term))
def __init__(self, name, name_no_t, vector, min_max=(None, None), colour=None, predictions=None, positioning=None):
    self.name = name
    self.name_no_t = name_no_t
    self.vector = [float(value) for value in vector]
    self.minimum, self.maximum = canonicalize_bounds(min_max, self.vector)
    self.colour = colour
    self.predictions = None if predictions is None else check.check_instance(predictions, LabelDistribution)
    self.positioning = positioning
def train(self, model_persistence, dataset, debug=False):
    check.check_instance(model_persistence, api.model.ModelPersistence)
    check.check_instance(model_persistence.model, api.model.IterativelyOptimized)
    check.check_instance(dataset, api.data.Dataset)
    train_account = TrainAccount(self.schedule.window_size)
    score = model_persistence.model.score(dataset.validate)
    train_account.baseline(score)
    logging.debug("Baseline validate score: %.4f" % score)
    model_persistence.save(train_account.version, {"score_validate": score})
    training_parameters = self.parameters

    if debug:
        logging.debug("Training under: %s." % training_parameters)

    while True:
        finished, reason = self.schedule.is_finished(train_account)

        if finished:
            assert reason is not None, "when the schedule is finished it must provide a reason"
            logging.debug("Finished training: %s" % reason)
            break

        round_losses = self._optimization_round(model_persistence.model, dataset.train, training_parameters, debug)
        score = model_persistence.model.score(dataset.validate)
        progress_marker = self.schedule.evaluate_progress(round_losses, train_account.best_score, score)

        if progress_marker.improved:
            logging.debug("Progress improved - proceeding. Validate scores: previous=%.4f, current=%.4f." % (train_account.best_score, score))
            train_account.record_round(round_losses, score, progress_marker)
            model_persistence.save(train_account.version, {"score_validate": score})
        else:
            logging.debug("Progress did not improve - decaying. Validate scores: previous=%.4f, current=%.4f." % (train_account.best_score, score))
            train_account.record_decay(training_parameters.learning_rate)
            model_persistence.load(train_account.version)
            training_parameters = self.schedule.decay(train_account, training_parameters)

            if debug:
                logging.debug("Training under: %s." % training_parameters)

    score_train = model_persistence.model.score(dataset.train)
    score_test = model_persistence.model.score(dataset.test)
    logging.debug("Final train / test scores: %.4f / %.4f" % (score_train, score_test))
def _optimization_round(self, model, trainstream, training_parameters, debug):
    check.check_instance(model, api.model.IterativelyOptimized)
    check.check_instance(trainstream, api.data.Datastream)
    check.check_instance(training_parameters, api.train.TrainingParameters)
    model_parameters = model.extract_parameters(training_parameters)
    randomized_trainstream = trainstream.as_randomized(training_parameters.batch_size * 4)
    slot_length = util.order_of_magnitude(training_parameters.epoch_size)
    epoch_template = "Epoch {:%dd} loss: {:.6f}" % slot_length
    epoch = -1
    losses = []

    while epoch + 1 < training_parameters.epoch_size:
        epoch += 1
        epoch_loss = model.step_optimize(model_parameters, randomized_trainstream, training_parameters.batch_size)
        losses += [epoch_loss]

        if debug:
            logging.debug(epoch_template.format(epoch, epoch_loss))

    return losses
def train(self, xy_sequences, training_parameters):
    check.check_instance(training_parameters, mlbase.TrainingParameters)

    if id(xy_sequences) != self._training_id:
        self._training_id = id(xy_sequences)
        # Sort the training sequences by their length to minimize padding
        # (each batch will consist of roughly equal lengthed sequences).
        self.training_xys = sorted(xy_sequences, key=lambda xy: len(xy.x))

    slot_length = len(str(training_parameters.epochs())) - 1
    case_slot_length = len(str(len(xy_sequences)))
    epoch_template = "Epoch training {:%dd} (loss, perplexity): {:.6f}, {:.6f}" % slot_length + (" (score {:.6f})" if training_parameters.score() else "")
    epochs_tenth = max(1, int(training_parameters.epochs() / 10))
    losses = training_parameters.losses()
    finished = False
    epoch = -1

    while not finished:
        epoch += 1
        epoch_loss = 0
        # Start at a different offset for every epoch to help avoid overfitting.
        offset = random.randint(0, min(training_parameters.batch(), len(self.training_xys)) - 1)
        count = 0
        first = True

        while offset < len(self.training_xys):
            if first:
                first = False
                batch = self.training_xys[0:offset]
            else:
                batch = self.training_xys[offset:offset + training_parameters.batch()]
                offset += training_parameters.batch()

            # To account for when offset is randomly assigned 0.
            if len(batch) > 0:
                count += len(batch)
                feed = self.get_training_feed(batch, training_parameters)
                _, loss = self.session.run([self.updates, self.cost], feed_dict=feed)
                #_, loss, logits, targets = self.session.run([self.updates, self.cost, self.logits, self.targets], feed_dict=feed)
                #_, loss, mask, uop1, lrs, mmm, mmn, mnn = self.session.run([self.updates, self.cost, self.mask, self.unrolled_outputs_p, self.losses_reduced, self.masked, self.masked2, self.masked3], feed_dict=feed)
                #_, loss, mask, uop1, tgs, lrs, mmm = self.session.run([self.updates, self.cost, self.mask, self.unrolled_outputs_p, self.targets, self.losses_reduced, self.masked], feed_dict=feed)
                #if epoch == 0:
                #    print(mask)
                #    print(uop1)
                #    print(tgs)
                #    print(lrs)
                #    print(mmm)
                #    print(mmn)
                #    print(mnn)
                #    print(dd)
                epoch_loss += loss

        assert count == len(xy_sequences), "%d != %d" % (count, len(xy_sequences))
        epoch_loss /= count
        epoch_perplexity = math.exp(epoch_loss)
        losses.append(epoch_loss)
        finished, reason = training_parameters.finished(epoch, losses)

        if not finished and epoch % epochs_tenth == 0 and training_parameters.debug():
            if training_parameters.score():
                score = 0.0
                offset = 0

                while offset < len(xy_sequences):
                    batch = xy_sequences[offset:offset + 32]
                    offset += 32
                    feed = self.get_testing_feed(batch)
                    time_distributions = self.session.run(self.output_distributions, feed_dict=feed)
                    score += self.score(batch, feed, time_distributions, False, case_slot_length)

                logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity, score / len(xy_sequences)))
            else:
                logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))

    if training_parameters.score():
        score = 0.0
        offset = 0

        while offset < len(xy_sequences):
            batch = xy_sequences[offset:offset + 32]
            offset += 32
            feed = self.get_testing_feed(batch)
            time_distributions = self.session.run(self.output_distributions, feed_dict=feed)
            score += self.score(batch, feed, time_distributions, False, case_slot_length)

        logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity, score / len(xy_sequences)))
    else:
        logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))

    #logging.debug("Training finished due to %s (%s)." % (reason, losses))
    return epoch_loss, -epoch_perplexity
def __init__(self, scope, hyper_parameters, extra, input_field, output_labels, case_field):
    super(SeparateFfnn, self).__init__(scope)
    self.hyper_parameters = check.check_instance(hyper_parameters, HyperParameters)
    self.extra = extra
    self.input_field = check.check_instance(input_field, mlbase.Field)
    self.output_labels = check.check_instance(output_labels, mlbase.Labels)
    self.case_field = check.check_instance(case_field, mlbase.Labels)
    batch_size_dimension = None

    # Notation:
    #   _p  placeholder
    #   _c  constant

    # Base variable setup
    self.input_p = self.placeholder("input_p", [batch_size_dimension, len(self.input_field)])
    self.input_cases_p = self.placeholder("input_cases_p", [batch_size_dimension], tf.int32)
    self.output_p = self.placeholder("output_p", [batch_size_dimension], tf.int32)
    self.learning_rate_p = self.placeholder("learning_rate_p", [1], tf.float32)
    self.clip_norm_p = self.placeholder("clip_norm_p", [1], tf.float32)
    self.dropout_keep_p = self.placeholder("dropout_keep_p", [1], tf.float32)
    self.batch_size, _ = tf.unstack(tf.shape(self.input_p))

    if self.hyper_parameters.layers > 0:
        self.E = self.variable("E", [len(self.case_field), len(self.input_field), self.hyper_parameters.width])
        self.E_bias = self.variable("E_bias", [len(self.case_field), 1, self.hyper_parameters.width], 0.)
        self.Y = self.variable("Y", [len(self.case_field), self.hyper_parameters.width, len(self.output_labels)])
        self.Y_bias = self.variable("Y_bias", [len(self.case_field), 1, len(self.output_labels)], 0.)

        # The E layer is the first layer.
        if self.hyper_parameters.layers > 1:
            self.H = self.variable("H", [len(self.case_field), self.hyper_parameters.layers - 1, self.hyper_parameters.width, self.hyper_parameters.width])
            self.H_bias = self.variable("H_bias", [len(self.case_field), self.hyper_parameters.layers - 1, 1, self.hyper_parameters.width], 0.)

        # Computational graph encoding
        cased_E = tf.nn.embedding_lookup(self.E, self.input_cases_p)
        mlbase.assert_shape(cased_E, [batch_size_dimension, len(self.input_field), self.hyper_parameters.width])
        cased_E_bias = tf.nn.embedding_lookup(self.E_bias, self.input_cases_p)
        mlbase.assert_shape(cased_E_bias, [batch_size_dimension, 1, self.hyper_parameters.width])
        self.embedded_input = tf.tanh(tf.matmul(tf.expand_dims(self.input_p, axis=1), cased_E) + cased_E_bias)
        mlbase.assert_shape(self.embedded_input, [batch_size_dimension, 1, self.hyper_parameters.width])
        hidden = self.embedded_input
        mlbase.assert_shape(hidden, [batch_size_dimension, 1, self.hyper_parameters.width])

        for l in range(self.hyper_parameters.layers - 1):
            cased_H = tf.nn.embedding_lookup(self.H, self.input_cases_p)
            mlbase.assert_shape(cased_H, [batch_size_dimension, self.hyper_parameters.layers - 1, self.hyper_parameters.width, self.hyper_parameters.width])
            cased_H_bias = tf.nn.embedding_lookup(self.H_bias, self.input_cases_p)
            mlbase.assert_shape(cased_H_bias, [batch_size_dimension, self.hyper_parameters.layers - 1, 1, self.hyper_parameters.width])
            hidden = tf.tanh(tf.matmul(self.dropout(hidden), cased_H[:, l]) + cased_H_bias[:, l])
            mlbase.assert_shape(hidden, [batch_size_dimension, 1, self.hyper_parameters.width])

        mlbase.assert_shape(hidden, [batch_size_dimension, 1, self.hyper_parameters.width])
        cased_Y = tf.nn.embedding_lookup(self.Y, self.input_cases_p)
        mlbase.assert_shape(cased_Y, [batch_size_dimension, self.hyper_parameters.width, len(self.output_labels)])
        cased_Y_bias = tf.nn.embedding_lookup(self.Y_bias, self.input_cases_p)
        mlbase.assert_shape(cased_Y_bias, [batch_size_dimension, 1, len(self.output_labels)])
    else:
        self.Y = self.variable("Y", [len(self.case_field), len(self.input_field), len(self.output_labels)])
        self.Y_bias = self.variable("Y_bias", [len(self.case_field), 1, len(self.output_labels)], 0.)

        # Computational graph encoding
        hidden = tf.expand_dims(self.input_p, axis=1)
        mlbase.assert_shape(hidden, [batch_size_dimension, 1, len(self.input_field)])
        cased_Y = tf.nn.embedding_lookup(self.Y, self.input_cases_p)
        mlbase.assert_shape(cased_Y, [batch_size_dimension, len(self.input_field), len(self.output_labels)])
        cased_Y_bias = tf.nn.embedding_lookup(self.Y_bias, self.input_cases_p)
        mlbase.assert_shape(cased_Y_bias, [batch_size_dimension, 1, len(self.output_labels)])

    cased_logit = tf.matmul(self.dropout(hidden), cased_Y) + cased_Y_bias
    mlbase.assert_shape(cased_logit, [batch_size_dimension, 1, len(self.output_labels)])
    self.output_logit = tf.reshape(cased_logit, [-1, len(self.output_labels)])
    mlbase.assert_shape(self.output_logit, [batch_size_dimension, len(self.output_labels)])
    self.output_distributions = tf.nn.softmax(self.output_logit)
    mlbase.assert_shape(self.output_distributions, [batch_size_dimension, len(self.output_labels)])
    #self.cost = tf.reduce_sum(tf.nn.nce_loss(
    #    weights=tf.transpose(self.Y),
    #    biases=self.Y_bias,
    #    labels=self.output_p,
    #    inputs=hidden,
    #    num_sampled=1,
    #    num_classes=len(self.output_labels)))
    loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits
    self.cost = tf.reduce_sum(loss_fn(labels=tf.stop_gradient(self.output_p), logits=self.output_logit))
    #self.updates = tf.train.AdamOptimizer().minimize(self.cost)
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_p[0])
    gradients = optimizer.compute_gradients(self.cost)
    gradients_clipped = [(tf.clip_by_norm(g, self.clip_norm_p[0]), var) for g, var in gradients if g is not None]
    self.updates = optimizer.apply_gradients(gradients_clipped)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())
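# Shape walk-through for the cased lookup in SeparateFfnn above (the concrete
# sizes are illustrative, not taken from the code): with 3 cases, an input
# field of width 10, hidden width 4, and a batch of 2 examples,
#   self.E                                       -> shape [3, 10, 4]
#   tf.nn.embedding_lookup(self.E, input_cases)  -> shape [2, 10, 4]  (one weight matrix per example, selected by its case)
#   tf.expand_dims(self.input_p, axis=1)         -> shape [2, 1, 10]
#   tf.matmul(...) of the two                    -> shape [2, 1, 4]
# which is exactly what the assert_shape calls check.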
def __init__(self, labels):
    super(MergeLabels, self).__init__(labels)
    self.labels = check.check_instance(labels, Labels)
def __init__(self, train, validate, test):
    self.train = check.check_instance(train, Datastream)
    self.validate = check.check_instance(validate, Datastream)
    self.test = check.check_instance(test, Datastream)
def find_matches(self, tolerance, first_only, predicates):
    check.check_instance(predicates, Predicates)
    # predicates: list of dicts, keyed by rnn part keys to lists of (axis, value) features
    #   [ {(cell, 0): [ (0, 0.5), (22, -0.02), ... ] }, ... ]
    matched_activations = None
    matched_sequences = None

    for level, predicate in predicates.levels():
        matches = None

        # Hit the _candidates query in order, to leverage the cached hit as much as possible.
        for key, features in sorted(predicate.items()):
            found_sequences = set()
            found_indices = {}
            first_feature = next(iter(features.items()))

            for sequence, index, *point in self._candidates(key, first_feature, tolerance, matched_sequences):
                point = tuple(point)
                candidate_point = []
                target_point = []
                operator_point = []

                for axis, target_operator in features.items():
                    target, operator = target_operator
                    candidate_point += [point[axis]]
                    target_point += [target]
                    operator_point += [operator]

                within = self._within(candidate_point, target_point, operator_point, tolerance)

                if within:
                    found_sequences.add(sequence)

                    if sequence not in found_indices:
                        found_indices[sequence] = set()

                    found_indices[sequence].add(index)

            if matched_sequences is None:
                matched_sequences = found_sequences
                logging.debug("initially matched sequences: %d" % len(matched_sequences))
            else:
                matched_sequences.intersection_update(found_sequences)
                logging.debug("subsequently matched sequences: %d" % len(matched_sequences))

            if matches is None:
                matches = found_indices
            else:
                next_matches = {}

                for sequence in matches.keys():
                    if sequence in found_indices:
                        next_matches[sequence] = matches[sequence].intersection(found_indices[sequence])

                matches = next_matches

        if matched_activations is None:
            matched_activations = {}

            for sequence, indices in matches.items():
                matched_activations[sequence] = [indices]
        else:
            removes = set()

            for sequence in matched_activations.keys():
                if sequence in matches:
                    matched_activations[sequence] += [matches[sequence]]
                else:
                    removes.add(sequence)

            for remove in removes:
                del matched_activations[remove]

                if remove in matched_sequences:
                    matched_sequences.remove(remove)

    matches = []

    for sequence, requirements in matched_activations.items():
        #logging.debug("searching for paths through: %s\n  %s" % (" ".join(sequence), requirements))
        paths = monotonic_paths(requirements, len(sequence), first_only)
        #logging.debug("found %d paths" % (len(paths)))

        for path in paths:
            matches += [(sequence, path)]

    return matches
def extract_parameters(self, training_parameters):
    check.check_instance(training_parameters, api.train.TrainingParameters)
    raise NotImplementedError()
def __init__(self, parameters, schedule):
    self.parameters = check.check_instance(parameters, TrainingParameters)
    self.schedule = check.check_instance(schedule, TrainingSchedule)
def __init__(self, model_dir, step, version_key):
    self.model_dir = check.check_instance(model_dir, str)
    self.step = check.check_gte(check.check_instance(step, int), 0)
    self.version_key = check.check_instance(version_key, str)
def to_dominant_inflection(self, lemma_term):
    self._finalize()
    check.check_instance(lemma_term, Term)
    return self.lemma_to_inflection[lemma_term]
def add_descendant(self, descendant):
    check.check_equal(self.finalized, False)
    check.check_instance(descendant, Node)
    check.check_not_equal(self.identifier, descendant.identifier)
    self.descendants.add(descendant)