def some_stats(corpus, docid, termv): doc = corpus.get_doc(docid) print(docid, "vocabulary {}:{} == {:.2}".format(doc.unique_len, doc.total_len, doc.unique_len / doc.total_len)) # exclude terms which appear only in one document (names, twitter handles) termv = list(filter(lambda t: corpus.get_term(t).document_frequency > 1, termv)) # function: return 5 terms sorted by a key function bykeyfun = lambda kf: sorted(zip(map(kf, termv), termv), reverse=True)[:10] # key functions tf = lambda stem: corpus.get_term(stem).term_frequency(docid) idf = lambda stem: corpus.get_term(stem).inverse_document_frequency tfidf = lambda stem: tf(stem) * idf(stem) # table spec cols = sorted( { "tf": tf, "idf": idf, "df": lambda stem: corpus.get_term(stem).document_frequency, "cf": lambda stem: corpus.get_term(stem).corpus_frequency, "tf*idf": tfidf, }.items() ) # line format, heading hfmt = " | ".join(len(cols) * ["{:>6} {:<16}"]) fmt = " | ".join(len(cols) * ["{:>6.4g} {!s:<16.16}"]) print(hfmt.format(*util.flatten(zip([name for name, _ in cols], itertools.repeat("term"))))) # data coldata = [bykeyfun(kf) for _, kf in cols] for row in zip(*coldata): print(fmt.format(*util.flatten(row)))
def reduce_anno_s(tmpl, cls, mtd, s): curried_e = partial(reduce_anno_e, tmpl, cls, mtd) curried_s = partial(reduce_anno_s, tmpl, cls, mtd) if s.kind in [C.S.EXP, C.S.ASSERT, C.S.RETURN]: red_e = curried_e(s.e) if type(red_e) is list: return red_e else: s.e = red_e elif s.kind == C.S.ASSIGN: s.le = curried_e(s.le) s.re = curried_e(s.re) elif s.kind == C.S.IF: s.e = curried_e(s.e) s.t = util.flatten(map(curried_s, s.t)) s.f = util.flatten(map(curried_s, s.f)) elif s.kind in [C.S.WHILE, C.S.REPEAT]: s.e = curried_e(s.e) s.b = util.flatten(map(curried_s, s.b)) elif s.kind == C.S.FOR: s.i = curried_e(s.i) s.init = curried_e(s.init) s.b = util.flatten(map(curried_s, s.b)) return [s]
def getPlayerId(self, *args): """Get the id of the current player""" a = tuple(flatten(args)) if self.playerId is not None and len(a) == 0: return self.playerId else: return int(self.conn.sendReceive_flat("world.getPlayerId", flatten(args)))
def group_by_sender(messages): """[Email] -> {str: [str]} : Associate lowercased email sender with a list of words.""" wordssd = collections.defaultdict(list) for m in messages: words = util.flatten(map(str.split, m.lines)) wordssd[m.sender.lower()].append(words) return {sender: util.flatten(wordss) for sender, wordss in wordssd.items()}
def test_flatten(self): def assertEqualListOrTuple(actual, expected): assert isinstance(expected, (list, tuple,)), "Test logic error" self.assertIsInstance(actual, (list, tuple,)) self.assertSequenceEqual(actual, expected) assertEqualListOrTuple(util.flatten([]), []) assertEqualListOrTuple(util.flatten([[([])]]), []) assertEqualListOrTuple(util.flatten([[1,2],3,[[4]],[(5,[6,7],8)]]), [1,2,3,4,5,6,7,8])
def getFormulaIdsFromPars(pars, onlyTheorems): thmPars = None if onlyTheorems: thmPars = map(lambda x: x[1], filter(lambda par: re.search(r"thm", par[0]), pars.items())) else: thmPars = map(lambda x: x[1], pars.items()) formulaTokens = filter(lambda token : token[:5] == "<fid ", flatten(flatten(thmPars))) return map(lambda token: token[5:-1], formulaTokens)
def Intersections(pts, console): '''Returns a dictionary of Intersections with Connections, with strings as keys.''' intind = OriginalIntersections(pts) net = IntersectionsJoin(intind) trimmed = IntersectionsTrim(net) rejoined = IntersectionsJoin(trimmed) intersections = IntersectionsBuild(rejoined, pts) t0 = time.time() intsInRange = list(set(util.flatten(map(lambda a: a[0], OptimalDistance(intersections))))) #makes list of intersections within reasonable distance to start/end ultimate_trim = sorted(util.flatten(map(lambda a: intersections[a].references[0].references, intsInRange))) console.add('Intersections', error=': '+str(time.time()-t0)) return IntersectionsBuild(IntersectionsJoin(ultimate_trim), pts)
def make(filenames, nprocs, cut): '''Create time residual PDF for a set of data files. Note: you may wish to use a smaller number of nprocs than you have CPUs; this function will almost certainly be I/O-bound. :param filenames: list of RAT ROOT files containing data :param cut: A Cut instance with cuts to apply to data :param nprocs: number of parallel jobs to run ''' p = multiprocessing.Pool(nprocs) erf = ERF(cut=cut) res = np.array(list(util.flatten(p.map(erf, filenames)))) print print len(res), 'entries' h, e = np.histogram(res, bins=750, range=(cut.t[0],cut.t[1]), normed=True) pdf = np.array(zip(e,h)) print 'total events:', total_events.value print 'events reconstructed:', events_reconstructed.value print 'events passing cuts:', events_passing_cuts.value with open('event_counts.txt', 'a') as f: f.write('%s %s %s %i %i %i %i\n' % (str(cut.e), str(cut.r), str(cut.r), len(res), total_events.value, events_reconstructed.value, events_passing_cuts.value)) return pdf
def __init__(self, past, future, features = None): """Create a training pattern. Parameters: past -- past feature vectors as a tensor of shape [P, V] where P is past days and V is the vectors/day future -- future feature vectors as a tensor of [F, V] where F is future days and V is the vectors/day features -- a sequence of feature names to use where None means use all features """ # calculate training input from past features past_subfeatures = [[self._subfeatures(vector, features) for vector in vectors] for vectors in past] self._input = numpy.array( [list(util.flatten(vectors)) for vectors in past_subfeatures]) # calculate training output from future volatility future_returns = numpy.log1p( [[vector.ret for vector in vectors] for vectors in future]) self._output = numpy.std(future_returns, axis = 0, ddof = 1)\ * numpy.sqrt(252) # calculate past returns for forecasts self._past_returns = numpy.log1p( [[vector.ret for vector in vectors] for vectors in past])
def finalize(self): merged_clusters = [] for c1 in self.clusters.values(): existing = None for m in c1: for c2 in merged_clusters: if m in c2: existing = c2 break if existing is not None: break if existing is not None: print("Merging clusters (shouldn't happen very often.)") existing.update(c1) else: merged_clusters.append(set(c1)) merged_clusters = [list(c) for c in merged_clusters] all_mentions = util.flatten(merged_clusters) assert len(all_mentions) == len(set(all_mentions)) return { "doc_key": self.doc_key, "sentences": self.sentences, "speakers": self.speakers, "clusters": merged_clusters }
def parsToFeatureCounts(pars, onlyTheorems): thmPars = None if onlyTheorems: thmPars = map(lambda x: x[1], filter(lambda par: re.search(r"thm", par[0]), pars.items())) else: thmPars = map(lambda x: x[1], pars.items()) textTokenList = filter(lambda token: not(token[:5] == "<fid "), flatten(flatten(thmPars))) tokenCounts = {} for token in textTokenList: if token not in tokenCounts: tokenCounts[token] = 0 tokenCounts[token] = tokenCounts[token] + 1 return tokenCounts
def choose_multi_label(labels, lang_model): longest = util.argmax(labels, scorer=lambda ngram: len(ngram)) if len(longest) > 3: best = util.argmax(bigrams.trigrams(longest), lambda ng: lang_model.lidstone(ng)) best = (best,) elif len(longest) == 3: best = longest best = (best,) elif len(longest) <= 2: # this is kinda shitty set of them .. would rather want all possible skip n-grams (O(N^2) of them?) z = [(tuple(x),) for x in labels] + bigrams.bigrams(labels) + bigrams.trigrams(labels) assert z z = [x for x in z if len(util.flatten(x)) <= 3] # sum is too weird # lexicographic ordering of the top-ranked sublabels in the multilabel def scorer(ngrams): scores = [lang_model.lidstone(ng) for ng in ngrams] if len(scores) < 3: scores += [0]*(3 - len(scores)) scores.sort(reverse=True) # print "SCORE %-30s %s" % (scores, ngrams) return scores z.sort(key= scorer, reverse=True) # print "RANKING",z best = z[0] else: assert False return best
def smooth_hscroll(string, row, iterations, delay=0.2, font=default_FONT): """ scrolls string at given row """ bytes = list(flatten(map(lambda c: font[c] + [0x00], string))) for i in xrange(iterations): position(0, row) data(bytes[i:i+84]) time.sleep(delay)
def min_value(self): if self.min_scale_value: return self.min_scale_value data = map(itemgetter("data"), self.data) if self.stacked: data = self.get_cumulative_data() return min(flatten(data))
def __init__(self, fsinput, fsgrammar, table=None): """ Initialize and return the object. @param fsinput: The input feature structure @type fsinput: C{nltk.featstruct.FeatStruct} @param fsgrammar: The generation grammar @type fsgrammar: C{nltk.featstruct.FeatStruct} @param table: The feature value type table @type table: C{fstypes.FeatureTypeTable} """ import copy self.fsinput = fsinput self.fsgrammar = fsgrammar self.table = table self.lr = LinkResolver() self.gpr = GrammarPathResolver(copy.deepcopy(fsgrammar), table) self.grammar_paths = flatten(self.gpr.resolve(copy.deepcopy(fsgrammar))) # the type table has been passed in # assign types to the feature values if table: for i, path in enumerate(self.grammar_paths): path = assign_types(table, path) self.grammar_paths[i] = path
def featured_sources_by_category(category=None): q = Source.query(Source.featured_priority < 1) if category: q = q.filter(Source.categories == category) q = q.order(Source.featured_priority) sources = q.fetch(400) categories = util.unique_ordered_list(util.flatten(s.categories for s in sources)) if category and category not in categories: categories.append(category) category_order = {category: i for i, category in enumerate(["Newspapers", "Culture", "Politics", "Tech", "Humor", "Local", "Longform"])} categories.sort(key=lambda x: category_order.get(x, 99999)) sources_by_category = defaultdict(list) for source in sources: for category in source.categories: sources_by_category[category].append(source) max_items_per_category = 60 if category else 15 for category, items in sources_by_category.items(): sources_by_category[category] = items[:min(len(items), max_items_per_category)] category_jsons = [] for category in categories: category_jsons.append({"id": category, "name": category, "sources": [s.json() for s in sources_by_category[category]]}) return category_jsons
def tensorize_example(self, example, is_training, oov_counts=None): clusters = example["clusters"] gold_mentions = sorted(tuple(m) for m in util.flatten(clusters)) gold_mention_map = {m:i for i,m in enumerate(gold_mentions)} cluster_ids = np.zeros(len(gold_mentions)) for cluster_id, cluster in enumerate(clusters): for mention in cluster: cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id sentences = example["sentences"] num_words = sum(len(s) for s in sentences) speakers = util.flatten(example["speakers"]) assert num_words == len(speakers) max_sentence_length = max(len(s) for s in sentences) max_word_length = max(max(max(len(w) for w in s) for s in sentences), max(self.config["filter_widths"])) word_emb = np.zeros([len(sentences), max_sentence_length, self.embedding_size]) char_index = np.zeros([len(sentences), max_sentence_length, max_word_length]) text_len = np.array([len(s) for s in sentences]) for i, sentence in enumerate(sentences): for j, word in enumerate(sentence): current_dim = 0 for k, (d, (s,l)) in enumerate(zip(self.embedding_dicts, self.embedding_info)): if l: current_word = word.lower() else: current_word = word if oov_counts is not None and current_word not in d: oov_counts[k] += 1 word_emb[i, j, current_dim:current_dim + s] = util.normalize(d[current_word]) current_dim += s char_index[i, j, :len(word)] = [self.char_dict[c] for c in word] speaker_dict = { s:i for i,s in enumerate(set(speakers)) } speaker_ids = np.array([speaker_dict[s] for s in speakers]) doc_key = example["doc_key"] genre = self.genres[doc_key[:2]] gold_starts, gold_ends = self.tensorize_mentions(gold_mentions) if is_training and len(sentences) > self.config["max_training_sentences"]: return self.truncate_example(word_emb, char_index, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids) else: return word_emb, char_index, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids
def fallbackGetCuboid(self, getBlock, *args): (x0,y0,z0,x1,y1,z1) = map(lambda x:int(math.floor(float(x))), flatten(args)) out = [] for y in range(min(y0,y1),max(y0,y1)+1): for x in range(min(x0,x1),max(x0,x1)+1): for z in range(min(z0,z1),max(z0,z1)+1): out.append(getBlock(x,y,z)) return out
def create_tfidf_vector(self): count_vect = CountVectorizer() doc = map(lambda x: " ".join(flatten(x)) + " " + \ x[0], self.goal_actions_map.items()) X_train_counts = count_vect.fit_transform(doc) tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) return X_train_tfidf
def reduce_anno(smpls, tmpl): for cls in util.flatten_classes(tmpl.classes, "inners"): for fld in cls.flds: reduce_anno_fld(smpls, tmpl, cls, fld) for mtd in cls.mtds: reduce_anno_mtd(smpls, tmpl, cls, mtd) red_s = map(partial(reduce_anno_s, tmpl, cls, mtd), mtd.body) mtd.body = util.flatten(red_s)
def test_helper2(): statement = GeneralizedOr(Symbol("a"), Symbol("a")) # need to manually set it to this as otherwise the constructor would flatten it automatically statement.args[0] = Or(And(Symbol("b"), Not(Symbol("c"))), And(Symbol("c"), Not(Symbol("b")))) new_statement, change = util.flatten(copy.deepcopy(statement)) assert_equal(new_statement, GeneralizedOr(Symbol("a"), And(Symbol("b"), Not(Symbol("c"))), And(Symbol("c"), Not(Symbol("b"))))) assert_true(change)
def get_date(self): self._parse_doc() for page_nr in xrange(len(self._pages)): page_text = '\n'.join([x for x in flatten(self._pages[page_nr]) if isinstance(x,unicode)]) m = re.search('\d{1,2}\.\d{1,2}\.\d{4}',page_text) try: return datetime.datetime.strptime(m.group(0),'%d.%m.%Y').date() except AttributeError: pass return None
def query_refinement(orig_q, topic): if topic.ngram == ("**EXTRAS**",): return None subquery = topic.label.replace("/ ","") if any(AllJunkLike.match(term) for term in util.flatten(topic.label_ngrams)): # then twitter phrase search will drop that token. at least emoticons. # so fallback to non-phrase search pass elif len(subquery.split()) > 1: subquery = '"%s"' % subquery return orig_q + " " + subquery
def stat3(datum=None): if not datum: datum = time.mktime(datetime.date.today().timetuple()) response.set_header('Access-Control-Allow-Origin', '*') response.content_type = 'application/json' return json.dumps( util.flatten(finalna_verzija.StatPresloNaDrugu( time.mktime(datetime.datetime.strptime(datum, "%m-%d-%Y").timetuple()) ,db) ))
def extract_request(self): path = self.request.path logging.debug("path = %s" % path) query = self.request.query_string specname = path.split('/')[1] logging.debug("specname = %s" % specname) d = urlparse.parse_qs(query) d = util.flatten(d) dd = parse_proper(d) return specname, dd
def node_roots(self, n): """Returns the set of roots which reach N.""" result = [] deps = self.dependencies(n) # Base case if len(deps) == 0: return [n] # Recursive case return util.flatten([self.node_roots(d) for d in deps])
def __bottom_up_search(messages, rules): """generate_text() helper method which performs recursive best-first-search :param messages: a set containing ``Message``s and/or ``ConstituentSet``s :type messages: ``set`` of ``Message``s or ``ConstituentSet``s :param rules: a list of ``Rule``s specifying relationships which can hold between the messages :type rules: ``list`` of ``Rule``s :return: a set containing one ``Message``, i.e. the first valid plan reached by best-first-search. returns None if no valid plan is found. :rtype: ``NoneType`` or a ``set`` of (``Message``s or ``ConstituentSet``s) """ if len(messages) == 1: return messages elif len(messages) < 1: raise Exception('Error: Input contains no messages.') else: try: options = [rule.get_options(messages) for rule in rules] except: raise Exception('ERROR: Rule {0} had trouble with these ' \ 'messages: {1}'.format(rule, messages)) options = flatten(options) options_list = [] for x, y, z in options: y.freeze() options_list.append( (x, y, z) ) if options_list == []: return None #sort all options by their score, beginning with the highest one sorted_options = sorted(options_list, key = lambda (x,y,z): x, reverse=True) for (score, rst_relation, removes) in sorted_options: """ rst_relation: a ConstituentSet (RST relation) that was generated by Rule.get_options() removes: a list containing those messages that are now part of 'rst_relation' and should therefore not be used again """ testSet = messages - set(removes) testSet = testSet.union(set([rst_relation])) # a set containing a ConstituentSet and one or more Messages that # haven't been integrated into a structure yet ret = __bottom_up_search(testSet, rules) if ret: return ret return None
def generate_respondents_summary(self): all_uids_by_option = [option.people.keys() for option in self.options] all_uids = util.flatten(all_uids_by_option) num_respondents = len(set(all_uids)) if num_respondents == 0: output = 'Nobody responded' elif num_respondents == 1: output = '1 person responded' else: output = '{} people responded'.format(num_respondents) return output
def test_import_similar_lobbyists(self): """Slightly different lobbyists are inserted into different rows.""" filings = list(lobbyists.parse_filings(util.testpath('lobbyists_slightly_different.xml'))) con = sqlite3.connect(':memory:') con = lobbyists.create_db(con) cur = con.cursor() self.failUnless(lobbyists.import_filings(cur, filings)) cur = con.cursor() cur.execute('SELECT id FROM lobbyist') lobbyers = util.flatten([x['lobbyists'] for x in filings if 'lobbyists' in x]) self.failUnlessEqual(len(cur.fetchall()), len(lobbyers))
def orig_func_wraper(msg, *args): # Take the callers name and snap it in two, result is log # level, e.g.: log_debug is DEBUG level. log_level = origfunc.__name__.split("_")[1] import log if getattr(log, "LOG_%s" % log_level.upper()) <= \ log.LOG_LEVEL_CURRENT: # flatten and stringify the positional params so we don't # tuple() a tuple or an array and end up with # weirdness. a = map(str, util.flatten(args)) print_log_msg(log_level, str(msg) % tuple(a))
def model_fn(features, labels, mode, params): x = tf.reshape(features, [-1, 99, 161, 1], name='input_incep8') x_norm = tf.layers.batch_normalization( x, training=mode == tf.estimator.ModeKeys.TRAIN, name='x_norm') if params['verbose_summary']: tf.summary.image('input', x) conv1 = tf.layers.conv2d(x_norm, filters=16, kernel_size=3, padding='same', activation=tf.nn.relu, name='conv1') conv1b = tf.layers.conv2d(conv1, filters=16, kernel_size=3, activation=tf.nn.relu, name='conv1b') pool1 = tf.layers.max_pooling2d(conv1b, pool_size=[2, 2], strides=2, name='pool1') if params['verbose_summary']: log_conv_kernel('conv1') log_conv_kernel('conv1b') tf.summary.image('pool1', pool1[:, :, :, 0:1]) incep2 = inception_block(pool1, t1x1=8, t3x3=8, t5x5=8, tmp=8, name='incep2') conv3 = tf.layers.conv2d(incep2, filters=32, kernel_size=3, padding='same', activation=tf.nn.relu, name='conv3') conv3b = tf.layers.conv2d(conv3, filters=32, kernel_size=3, activation=tf.nn.relu, name='conv3b') pool3 = tf.layers.max_pooling2d(conv3b, pool_size=[2, 2], strides=2, name='pool3') if params['verbose_summary']: log_conv_kernel('conv3') log_conv_kernel('conv3b') tf.summary.image('pool3', pool3[:, :, :, 0:1]) conv5 = tf.layers.conv2d(pool3, filters=64, kernel_size=3, padding='same', activation=tf.nn.relu, name='conv5') conv5b = tf.layers.conv2d(conv5, filters=64, kernel_size=3, activation=tf.nn.relu, name='conv5b') pool5 = tf.layers.max_pooling2d(conv5b, pool_size=[2, 2], strides=2, name='pool5') if params['verbose_summary']: log_conv_kernel('conv5') log_conv_kernel('conv5b') tf.summary.image('pool5', pool5[:, :, :, 0:1]) incep6 = inception_block(pool5, t1x1=32, t3x3=32, t5x5=32, tmp=32, name='incep6') conv7 = tf.layers.conv2d(incep6, filters=128, kernel_size=3, padding='same', activation=tf.nn.relu, name='conv7') conv7b = tf.layers.conv2d(conv7, filters=128, kernel_size=3, activation=tf.nn.relu, name='conv7b') pool7 = tf.layers.max_pooling2d(conv7b, pool_size=[2, 2], strides=2, name='pool7') if params['verbose_summary']: log_conv_kernel('conv7') log_conv_kernel('conv7b') tf.summary.image('pool7', pool7[:, :, :, 0:1]) incep8 = inception_block(pool7, t1x1=64, t3x3=64, t5x5=64, tmp=64, name='incep8') conv9 = tf.layers.conv2d(incep8, filters=256, kernel_size=3, padding='same', activation=tf.nn.relu, name='conv9') conv9b = tf.layers.conv2d(conv9, filters=256, kernel_size=3, activation=tf.nn.relu, name='conv9b') pool9 = tf.layers.max_pooling2d(conv9b, pool_size=[2, 2], strides=2, name='pool9') if params['verbose_summary']: log_conv_kernel('conv9') log_conv_kernel('conv9b') tf.summary.image('pool9', pool9[:, :, :, 0:1]) flat = flatten(pool9) dropout4 = tf.layers.dropout(flat, rate=params['dropout_rate'], training=mode == tf.estimator.ModeKeys.TRAIN, name='dropout4') dense4 = tf.layers.dense(dropout4, units=2048, activation=tf.nn.relu, name='dense4') logits = tf.layers.dense(dense4, units=params['num_classes'], name='logits') predictions = { 'classes': tf.argmax(logits, axis=1, name='prediction_classes'), 'probabilities': tf.nn.softmax(logits, name='prediction_softmax') } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec( mode=mode, predictions={'predictions': predictions['probabilities']}) onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=params['num_classes'], name='onehot_labels') loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits) tf.summary.scalar('loss', loss) optimizer = tf.train.GradientDescentOptimizer( learning_rate=params['learning_rate']) 
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) eval_metric_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions['classes']) } tf.summary.scalar('accuracy', eval_metric_ops['accuracy'][1]) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metric_ops)
def home(): commands = ['G28 X Y', 'G28 Z'] return flatten(commands)
def get(thing): commands = [lift(50), goto(thing), lower(), lift(43), lower(), lift()] return flatten(commands)
def get_inputs(script): inputs = deque([convert_to_ascii(cmd) for cmd in script]) inputs = deque(util.flatten(inputs)) return inputs
def find(smpls, what, cond): lst = map(op.methodcaller("find", what, cond), smpls) return set(util.flatten(lst))
def model_fn(features, labels, mode, params): x = tf.reshape(features, [-1, 125, 128, 1], name='input_flatv1') x_flat = tf.reshape(features, [-1, 16000]) x_norm = tf.layers.batch_normalization( x, training=mode == tf.estimator.ModeKeys.TRAIN, name='x_norm') if params['verbose_summary']: tf.summary.image('input', x) tf.summary.audio('input', x_flat, 16000) conv1 = tf.layers.conv2d(x_norm, filters=16, kernel_size=3, activation=tf.nn.relu, name='conv1') conv2 = tf.layers.conv2d(conv1, filters=32, kernel_size=3, activation=tf.nn.relu, name='conv2') conv3 = tf.layers.conv2d(conv2, filters=64, kernel_size=3, activation=tf.nn.relu, name='conv3') pool3 = tf.layers.max_pooling2d(conv3, pool_size=[2, 2], strides=2, name='pool3') if params['verbose_summary']: for i in range(1, 4): label = 'conv{}'.format(i) graph_utils.log_conv_kernel(label) tf.summary.image(label, tf.expand_dims(conv1[..., 0], -1)) tf.summary.image('pool3', pool3[:, :, :, 0:1]) conv4 = tf.layers.conv2d(pool3, filters=128, kernel_size=3, activation=tf.nn.relu, name='conv4') conv5 = tf.layers.conv2d(conv4, filters=256, kernel_size=3, activation=tf.nn.relu, name='conv5') conv6 = tf.layers.conv2d(conv5, filters=512, kernel_size=3, activation=tf.nn.relu, name='conv6') pool6 = tf.layers.max_pooling2d(conv6, pool_size=[2, 2], strides=2, name='pool6') if params['verbose_summary']: for i in range(4, 7): label = 'conv{}'.format(i) graph_utils.log_conv_kernel(label) tf.summary.image(label, tf.expand_dims(conv1[..., 0], -1)) tf.summary.image('pool6', pool6[:, :, :, 0:1]) conv7 = tf.layers.conv2d(pool6, filters=1024, kernel_size=3, activation=tf.nn.relu, name='conv7') conv8 = tf.layers.conv2d(conv7, filters=1024, kernel_size=5, activation=tf.nn.relu, name='conv8') conv9 = tf.layers.conv2d(conv8, filters=1024, kernel_size=7, activation=tf.nn.relu, name='conv9') pool9 = tf.layers.max_pooling2d(conv9, pool_size=[2, 2], strides=2, name='pool9') if params['verbose_summary']: for i in range(7, 10): label = 'conv{}'.format(i) graph_utils.log_conv_kernel(label) tf.summary.image(label, tf.expand_dims(conv1[..., 0], -1)) tf.summary.image('pool9', pool9[:, :, :, 0:1]) conv10 = tf.layers.conv2d(pool9, filters=512, kernel_size=1, activation=tf.nn.relu, name='conv10') conv11 = tf.layers.conv2d(conv10, filters=512, kernel_size=1, activation=tf.nn.relu, name='conv11') conv12 = tf.layers.conv2d(conv11, filters=512, kernel_size=1, activation=tf.nn.relu, name='conv12') flat = flatten(conv12) dense = tf.layers.dense(flat, units=1024, activation=tf.nn.relu, name='dense') dropout = tf.layers.dropout(dense, rate=params['dropout_rate'], training=mode == tf.estimator.ModeKeys.TRAIN, name='dropout') logits = tf.layers.dense(dropout, units=12, name='logits') predictions = { 'classes': tf.argmax(logits, axis=1, name='prediction_classes'), 'probabilities': tf.nn.softmax(logits, name='prediction_softmax') } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec( mode=mode, predictions={'predictions': predictions['probabilities']}) onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=12, name='onehot_labels') loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits) tf.summary.scalar('loss', loss) optimizer = tf.train.GradientDescentOptimizer( learning_rate=params['learning_rate']) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) eval_metric_ops = { 'accuracy': 
tf.metrics.accuracy(labels=labels, predictions=predictions['classes']) } tf.summary.scalar('accuracy', eval_metric_ops['accuracy'][1]) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metric_ops)
def run(data_file, is_train=False, **args): is_test = not is_train batchsize = args['batchsize'] model_name = args['model_name'] optimizer_name = args['optimizer'] save_dir = args['save_dir'] print args if save_dir[-1] != '/': save_dir = save_dir + '/' # TODO: check save_dir exist if not os.path.isdir(save_dir): err_msg = 'There is no dir : {}\n'.format(save_dir) err_msg += '##############################\n' err_msg += '## Please followiing: \n' err_msg += '## $ mkdir {}\n'.format(save_dir) err_msg += '##############################\n' raise ValueError(err_msg) save_name = args['save_name'] if save_name == '': save_name = '_'.join([model_name, optimizer_name]) save_name = save_dir + save_name xp = cuda.cupy if args['gpu'] >= 0 else np if args['gpu'] >= 0: cuda.get_device(args['gpu']).use() xp.random.seed(1234) # load files dev_file = args['dev_file'] test_file = args['test_file'] delimiter = args['delimiter'] sentences_train = [] if is_train: sentences_train = util.read_conll_file(filename=data_file, delimiter=delimiter, input_idx=0, output_idx=-1) if len(sentences_train) == 0: s = str(len(sentences_train)) err_msg = 'Invalid training sizes: {} sentences. '.format(s) raise ValueError(err_msg) else: # Predict sentences_train = util.read_raw_file(filename=data_file, delimiter=u' ') # sentences_train = sentences_train[:100] sentences_dev = [] sentences_test = [] if dev_file: sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter, input_idx=0, output_idx=-1) if test_file: sentences_test = util.read_conll_file(test_file, delimiter=delimiter, input_idx=0, output_idx=-1) save_vocab = save_name + '.vocab' save_vocab_char = save_name + '.vocab_char' save_tags_vocab = save_name + '.vocab_tag' save_train_config = save_name + '.train_config' # TODO: check unkown pos tags # TODO: compute unk words if is_train: sentences_words_train = [w_obj[0] for w_obj in sentences_train] vocab = util.build_vocab(sentences_words_train) vocab_char = util.build_vocab(util.flatten(sentences_words_train)) vocab_tags = util.build_tag_vocab(sentences_train) elif is_test: vocab = util.load_vocab(save_vocab) vocab_char = util.load_vocab(save_vocab_char) vocab_tags = util.load_vocab(save_tags_vocab) PAD_IDX = vocab[PADDING] UNK_IDX = vocab[UNKWORD] CHAR_PAD_IDX = vocab_char[PADDING] CHAR_UNK_IDX = vocab_char[UNKWORD] def parse_to_word_ids(sentences): return util.parse_to_word_ids(sentences, xp=xp, vocab=vocab, UNK_IDX=UNK_IDX, idx=0) def parse_to_char_ids(sentences): return util.parse_to_char_ids(sentences, xp=xp, vocab_char=vocab_char, UNK_IDX=CHAR_UNK_IDX, idx=0) def parse_to_tag_ids(sentences): return util.parse_to_tag_ids(sentences, xp=xp, vocab=vocab_tags, UNK_IDX=-1, idx=-1) # if is_train: x_train = parse_to_word_ids(sentences_train) x_char_train = parse_to_char_ids(sentences_train) y_train = parse_to_tag_ids(sentences_train) # elif is_test: # x_predict = parse_to_word_ids(sentences_predict) # x_char_predict = parse_to_char_ids(sentences_predict) # y_predict = parse_to_tag_ids(sentences_predict) x_dev = parse_to_word_ids(sentences_dev) x_char_dev = parse_to_char_ids(sentences_dev) y_dev = parse_to_tag_ids(sentences_dev) x_test = parse_to_word_ids(sentences_test) x_char_test = parse_to_char_ids(sentences_test) y_test = parse_to_tag_ids(sentences_test) cnt_train_unk = sum([xp.sum(d == UNK_IDX) for d in x_train]) cnt_train_word = sum([d.size for d in x_train]) unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word cnt_dev_unk = sum([xp.sum(d == UNK_IDX) for d in x_dev]) cnt_dev_word = sum([d.size for d in 
x_dev]) unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1) logging.info('train:' + str(len(x_train))) logging.info('dev :' + str(len(x_dev))) logging.info('test :' + str(len(x_test))) logging.info('vocab :' + str(len(vocab))) logging.info('vocab_tags:' + str(len(vocab_tags))) logging.info('unk count (train):' + str(cnt_train_unk)) logging.info('unk rate (train):' + str(unk_train_unk_rate)) logging.info('cnt all words (train):' + str(cnt_train_word)) logging.info('unk count (dev):' + str(cnt_dev_unk)) logging.info('unk rate (dev):' + str(unk_dev_unk_rate)) logging.info('cnt all words (dev):' + str(cnt_dev_word)) # show model config logging.info('######################') logging.info('## Model Config') logging.info('model_name:' + str(model_name)) logging.info('batchsize:' + str(batchsize)) logging.info('optimizer:' + str(optimizer_name)) # Save model config logging.info('######################') logging.info('## Model Save Config') logging.info('save_dir :' + str(save_dir)) # save vocab logging.info('save_vocab :' + save_vocab) logging.info('save_vocab_char :' + save_vocab_char) logging.info('save_tags_vocab :' + save_tags_vocab) logging.info('save_train_config :' + save_train_config) util.write_vocab(save_vocab, vocab) util.write_vocab(save_vocab_char, vocab_char) util.write_vocab(save_tags_vocab, vocab_tags) util.write_vocab(save_train_config, args) net = BiLSTM_CNN_CRF(n_vocab=len(vocab), n_char_vocab=len(vocab_char), emb_dim=args['n_word_emb'], hidden_dim=args['n_hidden'], n_layers=args['n_layer'], init_emb=None, n_label=len(vocab_tags)) if args['word_emb_file']: # set Pre-trained embeddings # emb_file = './emb/glove.6B.100d.txt' emb_file = args['word_emb_file'] word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab) net.word_embed.W.data[word_ids] = word_vecs if args['gpu'] >= 0: net.to_gpu() init_alpha = args['init_lr'] if optimizer_name == 'adam': opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9) elif optimizer_name == 'adadelta': opt = optimizers.AdaDelta() if optimizer_name == 'sgd_mom': opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9) if optimizer_name == 'sgd': opt = optimizers.SGD(lr=init_alpha) opt.setup(net) opt.add_hook(chainer.optimizer.GradientClipping(5.0)) def eval_loop(x_data, x_char_data, y_data): # dev or test net.set_train(train=False) iteration_list = range(0, len(x_data), batchsize) perm = np.random.permutation(len(x_data)) sum_loss = 0.0 predict_lists = [] for i_index, index in enumerate(iteration_list): data = [(x_data[i], x_char_data[i], y_data[i]) for i in perm[index:index + batchsize]] x, x_char, target_y = zip(*data) output = net(x_data=x, x_char_data=x_char) predict, loss = net.predict(output, target_y) sum_loss += loss.data predict_lists.extend(predict) return predict_lists, sum_loss if is_test: # predict model_filename = args['model_filename'] model_filename = save_dir + model_filename serializers.load_hdf5(model_filename, net) vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()]) x_predict = x_train x_char_predict = x_char_train y_predict = y_train predict_pairs, _ = eval_loop(x_predict, x_char_predict, y_predict) _, predict_tags = zip(*predict_pairs) for predict in predict_tags: predict = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)] print predict return False tmax = args['max_iter'] t = 0.0 for epoch in xrange(args['max_iter']): # train net.set_train(train=True) iteration_list = range(0, len(x_train), batchsize) perm = np.random.permutation(len(x_train)) sum_loss = 0.0 predict_train = [] for 
i_index, index in enumerate(iteration_list): data = [(x_train[i], x_char_train[i], y_train[i]) for i in perm[index:index + batchsize]] x, x_char, target_y = zip(*data) output = net(x_data=x, x_char_data=x_char) predict, loss = net.predict(output, target_y) # loss sum_loss += loss.data # update net.zerograds() loss.backward() opt.update() predict_train.extend(predict) # Evaluation train_accuracy = util.eval_accuracy(predict_train) logging.info('epoch:' + str(epoch)) logging.info(' [train]') logging.info(' loss :' + str(sum_loss)) logging.info(' accuracy :' + str(train_accuracy)) # Dev predict_dev, loss_dev = eval_loop(x_dev, x_char_dev, y_dev) # Evaluation dev_accuracy = util.eval_accuracy(predict_dev) logging.info(' [dev]') logging.info(' loss :' + str(loss_dev)) logging.info(' accuracy :' + str(dev_accuracy)) # Save model model_filename = save_name + '_epoch' + str(epoch) serializers.save_hdf5(model_filename + '.model', net) serializers.save_hdf5(model_filename + '.state', opt)
def finalize(self): # finalized: segments, segment_subtoken_map # populate speakers from info subtoken_idx = 0 for segment in self.segment_info: speakers = [] for i, tok_info in enumerate(segment): if tok_info is None and (i == 0 or i == len(segment) - 1): speakers.append('[SPL]') elif tok_info is None: speakers.append(speakers[-1]) else: speakers.append(tok_info[9]) if tok_info[4] == 'PRP': self.pronouns.append(subtoken_idx) subtoken_idx += 1 self.speakers += [speakers] # populate sentence map # populate clusters first_subtoken_index = -1 for seg_idx, segment in enumerate(self.segment_info): speakers = [] for i, tok_info in enumerate(segment): first_subtoken_index += 1 coref = tok_info[-2] if tok_info is not None else '-' if coref != "-": last_subtoken_index = first_subtoken_index + tok_info[ -1] - 1 for part in coref.split("|"): if part[0] == "(": if part[-1] == ")": cluster_id = int(part[1:-1]) self.clusters[cluster_id].append( (first_subtoken_index, last_subtoken_index)) else: cluster_id = int(part[1:]) self.coref_stacks[cluster_id].append( first_subtoken_index) else: cluster_id = int(part[:-1]) start = self.coref_stacks[cluster_id].pop() self.clusters[cluster_id].append( (start, last_subtoken_index)) # merge clusters merged_clusters = [] for c1 in self.clusters.values(): existing = None for m in c1: for c2 in merged_clusters: if m in c2: existing = c2 break if existing is not None: break if existing is not None: print("Merging clusters (shouldn't happen very often.)") existing.update(c1) else: merged_clusters.append(set(c1)) merged_clusters = [list(c) for c in merged_clusters] all_mentions = util.flatten(merged_clusters) sentence_map = get_sentence_map(self.segments, self.sentence_end) subtoken_map = util.flatten(self.segment_subtoken_map) assert len(all_mentions) == len(set(all_mentions)) num_words = len(util.flatten(self.segments)) assert num_words == len(util.flatten(self.speakers)) assert num_words == len(subtoken_map), (num_words, len(subtoken_map)) assert num_words == len(sentence_map), (num_words, len(sentence_map)) return { "doc_key": self.doc_key, "sentences": self.segments, "speakers": self.speakers, "constituents": [], "ner": [], "clusters": merged_clusters, 'sentence_map': sentence_map, "subtoken_map": subtoken_map, 'pronouns': self.pronouns }
def goto(thing): if thing not in thing_locations: raise (Exception('I don\'t know where {} is'.format(thing))) location = thing_locations[thing] commands = ['G1 X{} Y{} F3600'.format(location['X'], location['Y'])] return flatten(commands)
def papers_file(self): for file_obj in util.flatten(self.input()): if 'paper' in file_obj.path: return file_obj
end = file_mapping[span[1]][1] temp += [[start,end]] bert_clusters += [temp] example['sentences'] = bert_tokenized example['clusters'] = bert_clusters # json.dump(example, fp) train += [example] with open('test_english.pickle', 'wb') as handle: pickle.dump(train, handle, protocol=pickle.HIGHEST_PROTOCOL) gold_mentions = sorted(tuple(m) for m in util.flatten(clusters)) gold_mention_map = {m:i for i,m in enumerate(gold_mentions)} bert_cluster = train[-1]['clusters'] bert_tokenized = train[-1]['sentences'] bert_tokens = [] for bert_token in bert_tokenized : bert_tokens += bert_token bert_mentions = sorted(tuple(m) for m in util.flatten(bert_cluster)) bert_gold_mention_map = {m:i for i,m in enumerate(bert_mentions)} for i in range(len(gold_mentions)): span = gold_mentions[i]
smpl_files.extend(util.get_files_from_path(arg, "txt")) reset() smpls = [] for fname in smpl_files: smpl = Sample(fname, lambda mname: mname.endswith("Event")) smpls.append(smpl) if opt.method: _decls = decls(smpls) for cname in _decls.keys(): mnames = ", ".join(list(_decls[cname])) print "{}: {}".format(cname, mnames) if opt.event: _evts = util.flatten(map(op.attrgetter("evts"), smpls)) for evt in _evts: print str(evt) if opt.obj: print "# max: {}\n".format(max_objs(smpls)) _objs = objs(smpls) for cname in _objs.keys(): instances = ", ".join(_objs[cname]) print "{}: {}".format(cname, instances) if not sum([opt.method, opt.event, opt.obj]): for smpl in smpls: print "Sample: {}".format(smpl.name) print str(smpl)
def tensorize_example(self, example, is_training): clusters = example["clusters"] gold_mentions = sorted(tuple(m) for m in util.flatten(clusters)) gold_mention_map = {m: i for i, m in enumerate(gold_mentions)} cluster_ids = np.zeros(len(gold_mentions)) for cluster_id, cluster in enumerate(clusters): for mention in cluster: cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id + 1 sentences = example["sentences"] num_words = sum(len(s) for s in sentences) speakers = example["speakers"] # assert num_words == len(speakers), (num_words, len(speakers)) speaker_dict = self.get_speaker_dict(util.flatten(speakers)) sentence_map = example['sentence_map'] max_sentence_length = self.max_segment_len text_len = np.array([len(s) for s in sentences]) input_ids, input_mask, speaker_ids = [], [], [] for i, (sentence, speaker) in enumerate(zip(sentences, speakers)): sent_input_ids = self.tokenizer.convert_tokens_to_ids(sentence) sent_input_mask = [1] * len(sent_input_ids) sent_speaker_ids = [speaker_dict.get(s, 3) for s in speaker] while len(sent_input_ids) < max_sentence_length: sent_input_ids.append(0) sent_input_mask.append(0) sent_speaker_ids.append(0) input_ids.append(sent_input_ids) speaker_ids.append(sent_speaker_ids) input_mask.append(sent_input_mask) input_ids = np.array(input_ids) input_mask = np.array(input_mask) speaker_ids = np.array(speaker_ids) assert num_words == np.sum(input_mask), (num_words, np.sum(input_mask)) doc_key = example["doc_key"] self.subtoken_maps[doc_key] = example.get("subtoken_map", None) self.gold[doc_key] = example["clusters"] genre = self.genres.get(doc_key[:2], 0) gold_starts, gold_ends = self.tensorize_mentions(gold_mentions) example_tensors = (input_ids, input_mask, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids, sentence_map) if is_training and len( sentences) > self.config["max_training_sentences"]: if self.config['single_example']: return self.truncate_example(*example_tensors) else: offsets = range(self.config['max_training_sentences'], len(sentences), self.config['max_training_sentences']) tensor_list = [ self.truncate_example(*(example_tensors + (offset, ))) for offset in offsets ] return tensor_list else: return example_tensors
def evt_sources(self): srcss = map(op.attrgetter("sources"), self.evts) return util.rm_dup(util.flatten(srcss))
def learn(self, transitions, rewards): # Prepare for learning self.updateDicts() attributes = ["X_pos", "Y_pos", "X_size", "Y_size", "Colour", "Shape", "Nothing", "Reward"] model_updated = False if transitions and rewards: att_list = range(REWARD + 1) elif transitions and not rewards: att_list = range(REWARD) elif not transitions and rewards: att_list = [REWARD] else: return # For each object attribute or reward for i in att_list: # print("**************************") # print("Learning schemas for " + attributes[i]) # print("**************************") remaining = dict(zip(self.data[i].keys(),[[] for key in self.data[i].keys()])) # For each attribute/reward value to be predicted for key in self.data[i].keys(): # If the maximum number of schemas has already been learn we skip this round of learning if len(self.schemas[i][key]) >= LIMIT: remaining[key] = self.data[i][key] continue # If we are predicting rewards the learning data is constructed from all objects that have changed if i == REWARD: # Form positive cases xYes = [] xNo = [] for datum in self.data[i][key]: predicted = False for o in datum.keys(): if self.checkDatum([datum[o], key], i, consistency_check=True): predicted = True # self.evidence[i][key].append(datum) break if not predicted: xYes += [datum[c] for c in self.obsChanges] xNo += [datum[o]for o in datum.keys() if o not in self.obsChanges] # if not self.checkDatum([datum[o], key], i)[0]: # xYes += [datum[c] for c in self.obsChanges] # xNo += [datum[o] for o in datum.keys() if o not in self.obsChanges] # Form negative cases for other in self.data[i].keys(): if other != key: xNo += util.flatten([[datum[o] for o in datum.keys()] for datum in self.data[i][other] + self.evidence[i][other]]) # Otherwise we construct learning data in the standard way else: # Form positive cases xYes = [] for datum in self.data[i][key]: if datum[0][i] != key: # if self.checkDatum([datum,key], i)[0]: # self.evidence[i][key].append(datum) # else: # xYes.append(datum) if not self.checkDatum([datum,key], i, consistency_check=True): xYes.append(datum) self.data[i][key] = [datum for datum in self.data[i][key] if datum not in self.evidence[i][key]] # Form negative cases xNo = [self.data[i][other] + self.evidence[i][other] for other in self.data[i].keys() if other != key] xNo = util.flatten(xNo) # If there are no changes in this attribute of the primary object then we skip this round of learning if len(xYes) == 0: remaining[key] = self.data[i][key] # print("no changes for " + str(key)) continue # Form binary vectors for learning xYes = [util.toBinary(self, item) for item in xYes] xNo = [util.toBinary(self, item) for item in xNo] schemas = [util.toBinarySchema(self, schema) for schema in self.schemas[i][key]] oldSchemas = deepcopy(schemas) # print("Learning for " + str(key)) # Learn and output schemas, new evidence, and remaining positive cases if i == REWARD: [binarySchemas, _, _] = lern.learnSchemas(xYes, xNo, schemas, self.deterministic) else: [binarySchemas, binaryEvidence, binaryRemaining] = lern.learnSchemas(xYes, xNo, schemas, self.deterministic) # print("111111111111111111") # print schemas # print("222222222222222222") # print binarySchemas # print("333333333333333333") # Name new schemas new_names = [] new_schemas = [util.fromBinarySchema(self, s, key) for s in binarySchemas if s not in oldSchemas] for s in new_schemas: s.name = self.num_schemas new_names.append(s.name) self.num_schemas += 1 # Convert learnt schemas and evidence from binary output and add to model self.schemas[i][key] += new_schemas 
self.schemas[i][key] = util.simplify(self, self.schemas[i][key], key, attributes[i]) # Get initial counts of and display new schemas new_printed = False for s in self.schemas[i][key]: if s.name in new_names: if not new_printed: print("New schemas: ") new_printed = True model_updated = True if not self.deterministic: s.get_initial_counts(self, i) print(attributes[i] + " = " + str(key) + " <- " + s.display(no_head=True)) # # If they are reward schemas then the binary evidence and remaining data are not in the correct form to be stored # if i == REWARD: # for datum in self.data[i][key]: # predicted = False # for o in datum.keys(): # if self.checkDatum([datum[o], key], i)[0]: # predicted = True # self.evidence[i][key].append(datum) # break # if not predicted: # remaining[key].append(datum) # # # Otherwise we can convert directly back from the binary data and store the resukt # else: # self.evidence[i][key] += [util.fromBinary(self, datum) for datum in binaryEvidence] # remaining[key] = [util.fromBinary(self, datum) for datum in binaryRemaining] # # self.data[i] = remaining return model_updated
docs = [json.loads(line) for line in lines] tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input(docs) predicted_clusters, _, _ = runner.predict(model, tensor_examples) if args.output_path: with open(args.output_path, 'w') as f: for i, doc in enumerate(docs): doc['predicted_clusters'] = predicted_clusters[i] f.write(json.dumps(doc) + "\n") #print(f'Saved prediction in {args.output_path}') else: # Interactive input model.to(model.device) nlp = English() nlp.add_pipe(nlp.create_pipe('sentencizer')) while True: input_str = str(input('Input document:')) bert_tokenizer, spacy_tokenizer = data_processor.tokenizer, nlp doc = get_document_from_string(input_str, args.seg_len, bert_tokenizer, nlp) tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input([doc]) predicted_clusters, _, _ = runner.predict(model, tensor_examples) subtokens = util.flatten(doc['sentences']) #print('---Predicted clusters:') for cluster in predicted_clusters[0]: mentions_str = [' '.join(subtokens[m[0]:m[1]+1]) for m in cluster] mentions_str = [m.replace(' ##', '') for m in mentions_str] mentions_str = [m.replace('##', '') for m in mentions_str] #print(mentions_str) # Print out strings # print(cluster) # Print out indices
else: model.restore(sess) for j, line in enumerate(lines[1:]): parts = line.split('\t') example_id = parts[0].strip() text = parts[1].strip() doc = nlp(unicode(text)) sentences = [[unicode(str(w)) for w in sent] for sent in doc.sents] example = { 'sentences': sentences, 'doc_key': 'nw', 'speakers': [['' for _ in sent] for sent in doc.sents], 'clusters': [] } result = make_predictions(text, model, sess, example) words = util.flatten(result['sentences']) c = 0 nameA = parts[4].strip() nameA_offset = int(parts[5].strip()) nameB = parts[7].strip() nameB_offset = int(parts[8].strip()) pronoun_char_offset = int(parts[3].strip()) pronoun_index = None nameA_index = None nameB_index = None for k, token in enumerate(doc): if token.idx == pronoun_char_offset or ( pronoun_index is None and k + 1 < len(doc) and doc[k + 1].idx > pronoun_char_offset): pronoun_index = token.i elif token.idx == nameA_offset or (
def detip(): commands = [lift(), goto('tipstop'), lift(109), lift()] return flatten(commands)
def push(): commands = ['G1 E45 F3600'] return flatten(commands)
def flam3_to_node(flame): n = util.unflatten(util.flatten(apply_structure(flame_structure, flame))) n['type'] = 'node' return n
def print_predictions(self, example): words = util.flatten(example["sentences"]) for cluster in example["predicted_clusters"]: print(u"Predicted cluster: {}".format( [" ".join(words[m[0]:m[1] + 1]) for m in cluster]))
def main(): np.random.seed(0) tf.set_random_seed(0) dtype = np.float32 # 64-bit doesn't help much, search for 64-bit in # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4 u.default_dtype = dtype machine_epsilon = np.finfo(dtype).eps # 1e-7 or 1e-16 train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte') dsize = 10000 patches = train_images[:, :dsize] fs = [dsize, 28 * 28, 196, 28 * 28] # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial X0 = patches lambda_ = 3e-3 rho = tf.constant(0.1, dtype=dtype) beta = 3 W0f = W_uniform(fs[2], fs[3]) def f(i): return fs[i + 1] # W[i] has shape f[i] x f[i-1] dsize = f(-1) n = len(fs) - 2 # helper to create variables with numpy or TF initial value init_dict = {} # {var_placeholder: init_value} vard = {} # {var: util.VarInfo} def init_var(val, name, trainable=False, noinit=False): if isinstance(val, tf.Tensor): collections = [] if noinit else None var = tf.Variable(val, name=name, collections=collections) else: val = np.array(val) assert u.is_numeric, "Unknown type" holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder") var = tf.Variable(holder, name=name, trainable=trainable) init_dict[holder] = val var_p = tf.placeholder(var.dtype, var.shape) var_setter = var.assign(var_p) vard[var] = u.VarInfo(var_setter, var_p) return var lr = init_var(0.2, "lr") if purely_linear: # need lower LR without sigmoids lr = init_var(.02, "lr") Wf = init_var(W0f, "Wf", True) Wf_copy = init_var(W0f, "Wf_copy", True) W = u.unflatten(Wf, fs[1:]) # perftodo: this creates transposes X = init_var(X0, "X") W.insert(0, X) def sigmoid(x): if not purely_linear: return tf.sigmoid(x) else: return tf.identity(x) def d_sigmoid(y): if not purely_linear: return y * (1 - y) else: return 1 def kl(x, y): return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y)) def d_kl(x, y): return (1 - x) / (1 - y) - x / y # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None] * (n + 2) # A[0] is just for shape checks, assert fail on run # tf.assert always fails because of static assert # fail_node = tf.assert_equal(1, 0, message="too huge") fail_node = tf.Print(0, [0], "fail, this must never run") with tf.control_dependencies([fail_node]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n + 1): A[i + 1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = backprops from sampled labels needed for natural gradient B = [None] * (n + 1) B2 = [None] * (n + 1) B[n] = err * d_sigmoid(A[n + 1]) sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0) sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True) B2[n] = sampled_labels * d_sigmoid(A[n + 1]) for i in range(n - 1, -1, -1): backprop = t(W[i + 1]) @ B[i + 1] backprop2 = t(W[i + 1]) @ B2[i + 1] if i == 1 and not drop_sparsity: backprop += beta * d_kl(rho, rho_hat) backprop2 += beta * d_kl(rho, rho_hat) B[i] = backprop * d_sigmoid(A[i + 1]) B2[i] = backprop2 * d_sigmoid(A[i + 1]) # dW[i] = gradient of W[i] dW = [None] * (n + 1) pre_dW = [None] * (n + 1) # preconditioned dW pre_dW_stable = [None] * (n + 1) # preconditioned stable dW cov_A = [None] * (n + 1) # covariance of activations[i] cov_B2 = [None] * (n + 1) # covariance of synthetic backprops[i] vars_svd_A = [None] * (n + 1) vars_svd_B2 = [None] * (n + 1) for i in range(1, 
n + 1):
  cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i,))
  cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i,))
  vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i,))
  vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i,))
  if use_tikhonov:
    whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
  else:
    whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
  if use_tikhonov:
    whitened_B2 = u.regularized_inverse2(vars_svd_B2[i], L=Lambda) @ B[i]
  else:
    whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
  whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
  whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
  pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
  pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
  dW[i] = (B[i] @ t(A[i])) / dsize

# Loss function
reconstruction = u.L2(err) / (2 * dsize)
sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))

loss = reconstruction
if not drop_l2:
  loss = loss + L2
if not drop_sparsity:
  loss = loss + sparsity

grad_live = u.flatten(dW[1:])
pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
pre_grad_stable_live = u.flatten(pre_dW_stable[1:])  # sqrt fisher preconditioned grad
grad = init_var(grad_live, "grad")
pre_grad = init_var(pre_grad_live, "pre_grad")
pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

update_params_op = Wf.assign(Wf - lr * pre_grad).op
update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
save_params_op = Wf_copy.assign(Wf).op
pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad_stable * grad)
grad_norm = tf.reduce_sum(grad * grad)
pre_grad_norm = u.L2(pre_grad)
pre_grad_stable_norm = u.L2(pre_grad_stable)

def dump_svd_info(step):
  """Dump singular values and gradient values in those coordinates."""
  for i in range(1, n + 1):
    svd = vars_svd_A[i]
    s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
    util.dump(s0, "A_%d_%d" % (i, step))
    A0 = A[i].eval()
    At0 = v0.T @ A0
    util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
    util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
    util.dump(s0, "As_%d_%d" % (i, step))

  for i in range(1, n + 1):
    svd = vars_svd_B2[i]
    s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
    util.dump(s0, "B2_%d_%d" % (i, step))
    B0 = B[i].eval()
    Bt0 = v0.T @ B0
    util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
    util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
    util.dump(s0, "Bs_%d_%d" % (i, step))

def advance_batch():
  sess.run(sampled_labels.initializer)  # new labels for next call

def update_covariances():
  ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
  ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
  sess.run(ops_A + ops_B2)

def update_svds():
  if whitening_mode > 1:
    vars_svd_A[2].update()
  if whitening_mode > 2:
    vars_svd_B2[2].update()
  if whitening_mode > 3:
    vars_svd_B2[1].update()

def init_svds():
  """Initialize our SVD to identity matrices."""
  ops = []
  for i in range(1, n + 1):
    ops.extend(vars_svd_A[i].init_ops)
    ops.extend(vars_svd_B2[i].init_ops)
  sess = tf.get_default_session()
  sess.run(ops)

init_op = tf.global_variables_initializer()
#  tf.get_default_graph().finalize()

from tensorflow.core.protobuf import rewriter_config_pb2
rewrite_options = rewriter_config_pb2.RewriterConfig(
    disable_model_pruning=True,
    constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
    memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                rewrite_options=rewrite_options)
config = tf.ConfigProto(graph_options=graph_options)
#sess = tf.Session(config=config)
sess = tf.InteractiveSession(config=config)

sess.run(Wf.initializer, feed_dict=init_dict)
sess.run(X.initializer, feed_dict=init_dict)
advance_batch()
update_covariances()
init_svds()
sess.run(init_op, feed_dict=init_dict)  # initialize everything else

print("Running training.")
u.reset_time()

step_lengths = []  # keep track of learning rates
losses = []
ratios = []  # actual loss decrease / expected decrease
grad_norms = []
pre_grad_norms = []  # preconditioned grad norm squared
pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
target_delta_list = []  # predicted decrease, linear approximation
target_delta2_list = []  # predicted decrease, quadratic approximation
actual_delta_list = []  # actual decrease

# adaptive line search parameters
alpha = 0.3  # acceptable fraction of predicted decrease
beta = 0.8  # how much to shrink when the prediction is violated
growth_rate = 1.05  # how much to grow when too conservative

def update_cov_A(i):
  sess.run(cov_A[i].initializer)

def update_cov_B2(i):
  sess.run(cov_B2[i].initializer)

# only update whitening matrix of input activations in the beginning
if whitening_mode > 0:
  vars_svd_A[1].update()

# compute t(delta).H.delta/2
def hessian_quadratic(delta):
  #  update_covariances()
  W = u.unflatten(delta, fs[1:])
  W.insert(0, None)
  total = 0
  for l in range(1, n + 1):
    decrement = tf.trace(t(W[l]) @ cov_B2[l] @ W[l] @ cov_A[l])
    total += decrement
  return (total / 2).eval()

# compute t(delta).H^-1.delta/2
def hessian_quadratic_inv(delta):
  #  update_covariances()
  W = u.unflatten(delta, fs[1:])
  W.insert(0, None)
  total = 0
  for l in range(1, n + 1):
    invB2 = u.pseudo_inverse2(vars_svd_B2[l])
    invA = u.pseudo_inverse2(vars_svd_A[l])
    decrement = tf.trace(t(W[l]) @ invB2 @ W[l] @ invA)
    total += decrement
  return (total / 2).eval()

# do line search, dump values as csv
def line_search(initial_value, direction, step, num_steps):
  saved_val = tf.Variable(Wf)
  sess.run(saved_val.initializer)
  pl = tf.placeholder(dtype, shape=(), name="linesearch_p")
  assign_op = Wf.assign(initial_value - direction * step * pl)
  vals = []
  for i in range(num_steps):
    sess.run(assign_op, feed_dict={pl: i})
    vals.append(loss.eval())
  sess.run(Wf.assign(saved_val))  # restore original value
  return vals

for step in range(num_steps):
  update_covariances()
  if step % whiten_every_n_steps == 0:
    update_svds()

  sess.run(grad.initializer)
  sess.run(pre_grad.initializer)

  lr0, loss0 = sess.run([lr, loss])
  save_params_op.run()

  # regular inverse becomes unstable when grad norm exceeds 1
  stabilized_mode = grad_norm.eval() < 1

  if stabilized_mode and not use_tikhonov:
    update_params_stable_op.run()
  else:
    update_params_op.run()

  loss1 = loss.eval()
  advance_batch()

  # line search stuff
  target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                  -pre_grad_stable_dot_grad.eval())
  target_delta = lr0 * target_slope
  target_delta_list.append(target_delta)

  # second order prediction of target delta
  # TODO: the sign is wrong, debug this
  # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
  if local_quadratics:
    x0 = Wf_copy.eval()
    x_opt = x0 - pre_grad.eval()
    # computes t(x)@H^-1 @(x)/2
    y_opt = loss0 - hessian_quadratic_inv(grad)
    # computes t(x)@H @(x)/2
    y_expected = hessian_quadratic(Wf - x_opt) + y_opt
    target_delta2 = y_expected - loss0
    target_delta2_list.append(target_delta2)

  actual_delta = loss1 - loss0
  actual_slope = actual_delta / lr0
  slope_ratio = actual_slope / target_slope  # between 0 and 1.01
  actual_delta_list.append(actual_delta)

  if do_line_search:
    vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
    vals2 = line_search(Wf_copy, grad, lr / 100, 40)
    u.dump(vals1, "line1-%d" % (step,))
    u.dump(vals2, "line2-%d" % (step,))

  losses.append(loss0)
  step_lengths.append(lr0)
  ratios.append(slope_ratio)
  grad_norms.append(grad_norm.eval())
  pre_grad_norms.append(pre_grad_norm.eval())
  pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

  if step % report_frequency == 0:
    print("Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f" %
          (step, loss0, target_delta, actual_delta, slope_ratio,
           grad_norm.eval(), pre_grad_norm.eval()))

  if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
    # shrink if wrong prediction, don't shrink if prediction is tiny
    if slope_ratio < alpha and abs(target_delta) > 1e-6 and adaptive_step:
      print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
      print("Slope optimality %.2f, shrinking learning rate to %.2f" %
            (slope_ratio, lr0 * beta,))
      sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
    # grow learning rate, slope_ratio .99 worked best for gradient
    elif step > 0 and step % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
      print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
      print("Growing learning rate to %.2f" % (lr0 * growth_rate))
      sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * growth_rate})

  u.record_time()

# check against expected loss
if 'Apple' in sys.version:
  pass
  #  u.dump(losses, "kfac_small_final_mac.csv")
  targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
else:
  pass
  #  u.dump(losses, "kfac_small_final_linux.csv")
  targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
u.summarize_time()
print("Test passed")
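# Hedged sketch (not part of the script above): the training loop relies on
# u.flatten concatenating a list of per-layer gradient matrices into one flat
# vector and u.unflatten inverting that given the layer sizes. A minimal NumPy
# analogue of that round trip, with hypothetical names flatten_mats/unflatten_mats:
import numpy as np

def flatten_mats(mats):
    """Concatenate a list of 2-D arrays into one 1-D vector."""
    return np.concatenate([m.reshape(-1) for m in mats])

def unflatten_mats(vec, sizes):
    """Split a flat vector back into matrices of shape (sizes[l+1], sizes[l])."""
    mats, offset = [], 0
    for l in range(len(sizes) - 1):
        rows, cols = sizes[l + 1], sizes[l]
        mats.append(vec[offset:offset + rows * cols].reshape(rows, cols))
        offset += rows * cols
    return mats

# round trip for two "layers" with sizes 4 -> 3 -> 2
sizes = [4, 3, 2]
Ws = [np.ones((3, 4)), np.zeros((2, 3))]
assert all((a == b).all() for a, b in zip(unflatten_mats(flatten_mats(Ws), sizes), Ws))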
  pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
  pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
  dW[i] = (B[i] @ t(A[i])) / dsize

# Loss function
reconstruction = u.L2(err) / (2 * dsize)
sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))

loss = reconstruction
if not drop_l2:
  loss = loss + L2
if not drop_sparsity:
  loss = loss + sparsity

grad_live = u.flatten(dW[1:])
pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
pre_grad_stable_live = u.flatten(pre_dW_stable[1:])  # sqrt fisher preconditioned grad
grad = init_var(grad_live, "grad")
pre_grad = init_var(pre_grad_live, "pre_grad")
pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

update_params_op = Wf.assign(Wf - lr * pre_grad).op
update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
save_params_op = Wf_copy.assign(Wf).op
pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad_stable * grad)
grad_norm = tf.reduce_sum(grad * grad)
pre_grad_norm = u.L2(pre_grad)
pre_grad_stable_norm = u.L2(pre_grad_stable)
def map_smpls(smpls, f): return util.flatten(map(f, smpls))
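# Hypothetical usage of map_smpls, assuming util.flatten removes one level of
# nesting: mapping a per-sample function that returns a list yields one flat list.
# map_smpls([[1, 2], [3]], lambda s: [x * 10 for x in s])  ->  [10, 20, 30]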
def lift(height=98):
    commands = ['G1 Z{} F3600'.format(height)]
    return flatten(commands)
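# Hypothetical usage of lift, assuming flatten leaves an already-flat list of
# G-code strings unchanged, so moves can be concatenated and joined into a program:
# '\n'.join(lift() + lift(height=10))  ->  "G1 Z98 F3600\nG1 Z10 F3600"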
def intFloor(*args): return [int(math.floor(x)) for x in flatten(args)]
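# Hypothetical usage of intFloor, assuming flatten accepts arbitrarily nested
# arguments: mixed scalars, lists, and tuples are floored into one flat list of ints.
# intFloor(1.7, [2.3, (4.9,)], -0.5)  ->  [1, 2, 4, -1]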
def model_fn(features, labels, mode, params):
    training = mode == tf.estimator.ModeKeys.TRAIN

    x = tf.reshape(features, [-1, 125, 161, 2], name='cnn6')
    x_norm = tf.layers.batch_normalization(x, training=training, name='x_norm')
    x = tf.reshape(x_norm[:, :, :, 0], [-1, 125, 161, 1], name='reshape_spec')
    if params['verbose_summary']:
        tf.summary.image('input', x)

    conv = x
    conv = tf.layers.conv2d(conv, filters=16, kernel_size=5, activation=tf.nn.relu, name='conv1')
    pool = tf.layers.max_pooling2d(conv, pool_size=[2, 2], strides=2, name='pool1')
    if params['verbose_summary']:
        log_conv_kernel('conv1')
        tf.summary.image('pool1', pool[:, :, :, 0:1])

    conv = tf.layers.conv2d(pool, filters=32, kernel_size=5, activation=tf.nn.relu, name='conv2')
    pool = tf.layers.max_pooling2d(conv, pool_size=[2, 2], strides=2, name='pool2')
    if params['verbose_summary']:
        log_conv_kernel('conv2')
        tf.summary.image('pool2', pool[:, :, :, 0:1])

    conv = tf.layers.conv2d(pool, filters=64, kernel_size=5, activation=tf.nn.relu, name='conv3')
    pool = tf.layers.max_pooling2d(conv, pool_size=[2, 2], strides=2, name='pool3')
    if params['verbose_summary']:
        log_conv_kernel('conv3')
        tf.summary.image('pool3', pool[:, :, :, 0:1])

    conv = tf.layers.conv2d(pool, filters=128, kernel_size=5, activation=tf.nn.relu, name='conv4')
    pool = tf.layers.max_pooling2d(conv, pool_size=[2, 2], strides=2, name='pool4')
    if params['verbose_summary']:
        log_conv_kernel('conv4')
        tf.summary.image('pool4', pool[:, :, :, 0:1])

    flat = flatten(pool)
    dropout4 = tf.layers.dropout(flat, rate=params['dropout_rate'], training=training, name='dropout4')
    dense4 = tf.layers.dense(dropout4, units=128, activation=tf.nn.relu, name='dense4')

    logits = tf.layers.dense(dense4, units=params['num_classes'], name='logits')

    predictions = {
        'classes': tf.argmax(logits, axis=1, name='prediction_classes'),
        'probabilities': tf.nn.softmax(logits, name='prediction_probabilities'),
    }
    if mode == ModeKeys.PREDICT:
        return EstimatorSpec(mode=mode, predictions={'predictions': predictions['probabilities']})

    tf.summary.image('confusion_matrix', conf_mat(labels, predictions['classes'], params['num_classes']))

    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=params['num_classes'], name='onehot_labels')
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
    tf.summary.scalar('loss', loss)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=params['learning_rate'])
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions['classes'])
    }
    tf.summary.scalar('accuracy', eval_metric_ops['accuracy'][1])

    return EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops
    )
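# Hedged sketch (assumed helper, not shown above): in model_fn, flatten(pool) is
# the tensor flatten, collapsing the pooled feature map [batch, h, w, c] into
# [batch, h*w*c] before the dense layers, roughly what tf.layers.flatten does in
# TF 1.x when the inner dimensions are statically known:
import tensorflow as tf

def flatten(tensor):
    """Reshape a [batch, ...] tensor to [batch, prod(...)], keeping batch dynamic."""
    shape = tensor.get_shape().as_list()
    dim = 1
    for d in shape[1:]:
        dim *= d
    return tf.reshape(tensor, [-1, dim])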
  whitened_B2 = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
  pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
  dW[i] = (B[i] @ t(A[i])) / dsize

# Cost function
reconstruction = u.L2(err) / (2 * dsize)
sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))

cost = reconstruction
if not drop_l2:
  cost = cost + L2
if not drop_sparsity:
  cost = cost + sparsity

grad_live = u.flatten(dW[1:])
pre_grad_live = u.flatten(pre_dW[1:])  # preconditioned gradient
grad = init_var(grad_live, "grad")
pre_grad = init_var(pre_grad_live, "pre_grad")

update_params_op = Wf.assign(Wf - lr * pre_grad).op
save_params_op = Wf_copy.assign(Wf).op
pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)

def advance_batch():
  sess.run(sampled_labels.initializer)  # new labels for next call

def update_covariances():
  ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
  ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
  sess.run(ops_A + ops_B2)
def getAllCategoryNames(self):
    return list(set(flatten([log.getAllCategoryNames() for log in self.list])))
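# Hypothetical usage, assuming each log's getAllCategoryNames() returns a list of
# strings: flatten merges the per-log lists and set() drops duplicates, so the
# result is the unique category names across self.list (order not preserved).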
def tensorize_example(self, example, is_training, oov_counts=None):
  clusters = example["clusters"]

  gold_mentions = sorted(tuple(m) for m in util.flatten(clusters))
  gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
  cluster_ids = np.zeros(len(gold_mentions))
  for cluster_id, cluster in enumerate(clusters):
    for mention in cluster:
      cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id

  sentences = example["sentences"]
  num_words = sum(len(s) for s in sentences)
  speakers = util.flatten(example["speakers"])

  # add POS tag and NER
  pos_tags = example["pos_tags"]
  # if self.config["use_categories"]:
  categories = example["categories"]  # categories
  if self.config["use_ner_g"] or self.config["use_ner_phi"]:
    ner_tags = example["ner_tags"]
    ner_ids = np.array([
        self.ner_tag_dict[s[2:].replace('*', '')] if len(s) > 1 else self.ner_tag_dict[s]
        for s in util.flatten(ner_tags)
    ])
  else:
    ner_ids = np.array([])

  assert num_words == len(speakers)

  max_sentence_length = max(len(s) for s in sentences)
  max_word_length = max(max(max(len(w) for w in s) for s in sentences),
                        max(self.config["filter_widths"]))
  word_emb = np.zeros([len(sentences), max_sentence_length, self.embedding_size])
  char_index = np.zeros([len(sentences), max_sentence_length, max_word_length])
  text_len = np.array([len(s) for s in sentences])
  pos_tag_emb = np.zeros([len(sentences), max_sentence_length, len(self.pos_tag_dict)])
  ner_tag_emb = np.zeros([len(sentences), max_sentence_length, len(self.ner_tag_dict)])
  categories_emb = np.zeros([len(sentences), max_sentence_length, len(self.categories_dict)])

  for i, sentence in enumerate(sentences):
    for j, word in enumerate(sentence):
      current_dim = 0
      # word embedding with glove
      # k is index, 0 or 1
      # d is embedding dict, turian or glove
      # s is size (either 300 or 50)
      # l is lowercase = true or false (usually false)
      for k, (d, (s, l)) in enumerate(zip(self.embedding_dicts, self.embedding_info)):
        # print "s", s
        # print "k", k
        if l:
          current_word = word.lower()
        else:
          current_word = word
        if oov_counts is not None and current_word not in d:
          oov_counts[k] += 1
        word_emb[i, j, current_dim:current_dim + s] = util.normalize(d[current_word])
        current_dim += s
      # character embedding
      char_index[i, j, :len(word)] = [self.char_dict[c] for c in word]

      # one hot encoding
      pos_tag_emb[i, j, :] = np.zeros([len(self.pos_tag_dict)])
      one = self.pos_tag_dict.get(pos_tags[i][j], 0)
      pos_tag_emb[i, j, one] = 1

      if self.config["use_ner_g"]:
        ner_tag_emb[i, j, :] = np.zeros([len(self.ner_tag_dict)])
        one = self.ner_tag_dict.get(ner_tags[i][j], 0)
        ner_tag_emb[i, j, one] = 1

      if self.config["use_categories"]:
        categories_emb[i, j, :] = np.zeros([len(self.categories_dict)])
        one = self.categories_dict.get(categories[i][j], 0)
        categories_emb[i, j, one] = 1

  # print type(self.glove_embedding_dict)
  # print self.glove_embedding_dict
  cat_glove_emb = np.zeros([len(sentences), max_sentence_length, self.glove_embedding_size])
  # print len(self.glove_embedding_dict)
  if self.config["use_categories_glove"]:
    for i, category in enumerate(categories):
      for j, cat in enumerate(category):
        # current_dim = 0
        # for d in self.glove_embedding_dict:
        if cat != '-':
          # print cat
          cat_glove_emb[i, j, :] = util.normalize(self.glove_embedding_dict[cat])
          # current_dim += 300

  speaker_dict = {s: i for i, s in enumerate(set(speakers))}
  speaker_ids = np.array([speaker_dict[s] for s in speakers])

  doc_key = example["doc_key"]
  genre = self.genres[doc_key[:2]]

  gold_starts, gold_ends = self.tensorize_mentions(gold_mentions)

  if is_training and len(sentences) > self.config["max_training_sentences"]:
    return self.truncate_example(word_emb, char_index, text_len, speaker_ids, genre,
                                 is_training, gold_starts, gold_ends, cluster_ids,
                                 pos_tag_emb, ner_tag_emb, categories_emb, ner_ids,
                                 cat_glove_emb)
    # return self.truncate_example(word_emb, char_index, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids, pos_tag_emb, ner_tag_emb, ner_ids)
  else:
    return word_emb, char_index, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids, pos_tag_emb, ner_tag_emb, categories_emb, ner_ids, cat_glove_emb