def _store_tree(root_node):
    """Stores the reading alternation tree to the database.

    Walks every node of root_node and bulk-inserts one row per node,
    carrying the node's nested-set visit numbers.  Both reading-alt
    tables are cleared first (the kanji-reading table is repopulated
    separately by _store_kanji_readings).
    """
    def _walk_rows(tree):
        # One tuple per tree node, in table column order.
        for node in tree.walk():
            yield (node.label, node.code, node.probability,
                    node.left_visit, node.right_visit)

    cursor = connection.cursor()
    cursor.execute('DELETE FROM reading_alt_kanjireading')
    cursor.execute('DELETE FROM reading_alt_readingalternation')

    insert_sql = """
            INSERT INTO reading_alt_readingalternation
            (value, code, probability, left_visit, right_visit)
            VALUES (%s, %s, %s, %s, %s)
            """
    batch_size = 10000
    # Insert in bounded batches so we never hold the whole tree's rows
    # in a single executemany() call.
    for batch in groups_of_n(batch_size, _walk_rows(root_node)):
        cursor.executemany(insert_sql, batch)
    cursor.close()
def _store_kanji_readings(alt_tree):
    """Stores a separate table of only leaf-node readings.

    For each kanji child of alt_tree, keeps the highest-probability
    path per distinct reading, normalises those probabilities into a
    pdf/cdf, and bulk-inserts one row per (kanji, reading).

    NOTE(review): this function appears twice in this file with
    identical behaviour; one copy should be deleted.
    """
    def iter_results(tree):
        for kanji_node in tree.children.itervalues():
            kanji = kanji_node.label
            reading_map = {}
            for leaf_node in kanji_node.walk_leaves():
                # Path probability: probabilities are stored as logs,
                # so the product along the path is exp(sum(...)).
                reading = leaf_node.label
                leaf_path = leaf_node.get_ancestors()[1:]
                pdf = math.exp(sum([n.probability for n in leaf_path]))
                codes = set([n.code for n in leaf_path])
                if reading not in reading_map or \
                        reading_map[reading]['pdf'] < pdf:
                    # BUG FIX: record this leaf's left_visit alongside
                    # its pdf/codes.  Previously the yield below read
                    # leaf_path[-1].left_visit, i.e. the left_visit of
                    # whichever leaf happened to be walked *last*, so
                    # every reading of a kanji got the same (usually
                    # wrong) left_visit.
                    reading_map[reading] = {
                        'pdf': pdf,
                        'codes': codes,
                        'left_visit': leaf_path[-1].left_visit,
                    }

            if not reading_map:
                # No readings for this kanji
                continue

            # Normalise the per-reading maxima into a distribution.
            total = sum(r['pdf'] for r in reading_map.itervalues())
            cdf = 0.0
            for reading, entry in reading_map.iteritems():
                pdf = entry['pdf'] / total
                cdf += pdf
                yield (kanji, reading, ''.join(sorted(entry['codes'])),
                        pdf, cdf, entry['left_visit'])
            assert abs(cdf - 1.0) < 1e-8
        return

    max_per_insert = 10000
    all_results = iter_results(alt_tree)
    cursor = connection.cursor()
    cursor.execute('DELETE FROM reading_alt_kanjireading')

    # Field names are quoted per-backend ('condition' is a reserved
    # word on some databases).
    quoted_fields = tuple(
        connection.ops.quote_name(f) for f in [
            'condition', 'symbol', 'alternations', 'pdf', 'cdf',
            'reading_alternation_id',
        ])
    insert_sql = """
            INSERT INTO reading_alt_kanjireading (%s)
            VALUES (%%s, %%s, %%s, %%s, %%s, %%s)
            """ % ', '.join(quoted_fields)
    for results in groups_of_n(max_per_insert, all_results):
        cursor.executemany(insert_sql, results)
    cursor.execute('COMMIT')
    cursor.close()
def _store_kanji_readings(alt_tree):
    """Stores a separate table of only leaf-node readings.

    Keeps, for every kanji, the best-probability path per reading and
    inserts the normalised pdf/cdf rows in batches.

    NOTE(review): duplicate definition — an identical copy of this
    function exists earlier in this file; only one should remain.
    """
    def iter_results(tree):
        for kanji_node in tree.children.itervalues():
            kanji = kanji_node.label
            reading_map = {}
            for leaf_node in kanji_node.walk_leaves():
                # Log-probabilities along the path multiply out via
                # exp(sum(...)).
                reading = leaf_node.label
                leaf_path = leaf_node.get_ancestors()[1:]
                pdf = math.exp(sum([n.probability for n in leaf_path]))
                codes = set([n.code for n in leaf_path])
                if reading not in reading_map or \
                        reading_map[reading]['pdf'] < pdf:
                    # BUG FIX: left_visit must be captured here, with
                    # the winning leaf.  The original code yielded
                    # leaf_path[-1].left_visit after this loop ended,
                    # which always referred to the last-walked leaf
                    # rather than the reading actually being emitted.
                    reading_map[reading] = {
                        'pdf': pdf,
                        'codes': codes,
                        'left_visit': leaf_path[-1].left_visit,
                    }

            if not reading_map:
                # No readings for this kanji
                continue

            total = sum(r['pdf'] for r in reading_map.itervalues())
            cdf = 0.0
            for reading, entry in reading_map.iteritems():
                pdf = entry['pdf'] / total
                cdf += pdf
                yield (kanji, reading, ''.join(sorted(entry['codes'])),
                        pdf, cdf, entry['left_visit'])
            assert abs(cdf - 1.0) < 1e-8
        return

    max_per_insert = 10000
    all_results = iter_results(alt_tree)
    cursor = connection.cursor()
    cursor.execute('DELETE FROM reading_alt_kanjireading')

    # Backend-appropriate quoting; 'condition' is a reserved word in
    # some SQL dialects.
    quoted_fields = tuple(
        connection.ops.quote_name(f)
        for f in ['condition', 'symbol', 'alternations', 'pdf', 'cdf',
                  'reading_alternation_id'])
    insert_sql = """
            INSERT INTO reading_alt_kanjireading (%s)
            VALUES (%%s, %%s, %%s, %%s, %%s, %%s)
            """ % ', '.join(quoted_fields)
    for results in groups_of_n(max_per_insert, all_results):
        cursor.executemany(insert_sql, results)
    cursor.execute('COMMIT')
    cursor.close()
def from_dist(cls, prob_dist):
    """Replaces this model's table with the given distribution.

    Inserts one (symbol, pdf, cdf) row per sample, where cdf is the
    running total of pdf in prob_dist.samples() iteration order.
    """
    table = cls._meta.db_table
    cursor = connection.cursor()
    cursor.execute('DELETE FROM %s' % table)

    # Materialise all rows first, accumulating the cdf as we go.
    running_total = 0.0
    rows = []
    for symbol in prob_dist.samples():
        freq = prob_dist.freq(symbol)
        running_total += freq
        rows.append((symbol, freq, running_total))

    statement = """
            INSERT INTO `%s` (`symbol`, `pdf`, `cdf`)
            VALUES (%%s, %%s, %%s)
            """ % table
    for chunk in groups_of_n(N_ROWS_PER_INSERT, rows):
        cursor.executemany(statement, chunk)
    cursor.close()
def from_dist(cls, prob_dist):
    """Persists prob_dist into this model's table.

    Existing rows are removed, then each sample is written as a
    (symbol, pdf, cdf) triple with cdf accumulated across samples.
    """
    cursor = connection.cursor()
    db_table = cls._meta.db_table
    cursor.execute('DELETE FROM %s' % db_table)

    cumulative = 0.0
    pending = []
    for sym in prob_dist.samples():
        p = prob_dist.freq(sym)
        cumulative += p
        pending.append((sym, p, cumulative))

    sql = ("""
            INSERT INTO `%s` (`symbol`, `pdf`, `cdf`)
            VALUES (%%s, %%s, %%s)
            """ % db_table)
    # Chunked executemany keeps each insert statement a bounded size.
    for group in groups_of_n(N_ROWS_PER_INSERT, pending):
        cursor.executemany(sql, group)
    cursor.close()