def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()

    # initialize the model and run soft EM training
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    softem(full_graph, model)
def chinese_whispers(graph: FullGraph, weights: np.ndarray,
                     threshold: float = 0.0, root_weights: bool = False,
                     max_iterations: int = None) -> List[List[LexiconEntry]]:

    def _weight(edge):
        return weights[graph.edge_set.get_id(edge)]

    # initialize: each node starts in its own cluster
    node_cluster = np.empty(graph.number_of_nodes(), dtype=np.int_)
    for node in graph:
        i = graph.lexicon.get_id(node)
        node_cluster[i] = i

    # run the clustering
    iter_num = 0
    nodes_list = list(graph.nodes_iter())
    while True:
        iter_num += 1
        if max_iterations is not None and max_iterations > 0 and \
                iter_num > max_iterations:
            break
        changed = 0
        logging.getLogger('main').info('Iteration {}'.format(iter_num))
        random.shuffle(nodes_list)
        for node in tqdm.tqdm(nodes_list):
            n_id = graph.lexicon.get_id(node)
            cluster_scores = {node_cluster[n_id]: 0.0}
            for edge in graph.ingoing_edges(node):
                if _weight(edge) > threshold:
                    src_id = graph.lexicon.get_id(edge.source)
                    if node_cluster[src_id] not in cluster_scores:
                        cluster_scores[node_cluster[src_id]] = 0.0
                    cluster_scores[node_cluster[src_id]] += _weight(edge)
            if root_weights:
                root_weight = 1 - sum(cluster_scores.values())
                cluster_scores[node_cluster[n_id]] += root_weight
            cluster_id = max(cluster_scores.items(), key=itemgetter(1))[0]
            if cluster_id != node_cluster[n_id]:
                node_cluster[n_id] = cluster_id
                changed += 1
        logging.getLogger('main').info('changed nodes: {}'.format(changed))
        if changed == 0:
            break

    # retrieve the clusters
    clusters_hash = {}
    for i, cl_id in enumerate(node_cluster):
        if cl_id not in clusters_hash:
            clusters_hash[cl_id] = []
        clusters_hash[cl_id].append(graph.lexicon[i])
    return sorted(clusters_hash.values())
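def cluster_with_unit_weights(graph: FullGraph) -> List[List[LexiconEntry]]:
    # A minimal usage sketch (not part of the original pipeline): run
    # chinese_whispers() on an already built FullGraph with a unit weight
    # for every edge. The weight vector must be indexed by the ids of
    # graph.edge_set, as _weight() above assumes; the parameter values
    # below are illustrative assumptions.
    weights = np.ones(len(graph.edge_set))
    return chinese_whispers(graph, weights, threshold=0.5,
                            root_weights=True, max_iterations=10)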
def __init__(self, full_graph: FullGraph, model: ModelSuite,
             tagset: List[Tuple[str]], warmup_iter: int = 1000,
             sampling_iter: int = 100000, iter_stat_interval: int = 1,
             min_subtree_prob: float = 1e-100) -> None:
    self.tagset = tagset
    logging.getLogger('main').debug('tagset = {}'.format(str(tagset)))
    self.tag_idx = {tag: i for i, tag in enumerate(tagset)}
    self.min_subtree_prob = min_subtree_prob
    untagged_edge_set, self.edge_tr_mat = \
        self._compute_untagged_edges_and_transition_mat(full_graph, model)
    untagged_full_graph = FullGraph(full_graph.lexicon, untagged_edge_set)
    super().__init__(untagged_full_graph, model, warmup_iter=warmup_iter,
                     sampling_iter=sampling_iter,
                     iter_stat_interval=iter_stat_interval)
    self._compute_root_prob()
    self._fast_compute_leaf_prob()
    self.init_forward_prob()
    self.init_backward_prob()
    self.write_debug_info()
def initialize(self, graph: FullGraph) -> None:
    '''Fit the models assuming unit weights for all roots and edges.'''
    if shared.config['General'].getboolean('supervised'):
        # supervised learning -- take only nodes with no incoming edges
        # as roots
        root_weights = np.array([1 if not graph.predecessors(node) else 0
                                 for node in graph.lexicon])
    else:
        root_weights = np.ones(len(graph.lexicon))
    edge_weights = np.ones(len(graph.edge_set))
    self.root_model.fit(graph.lexicon, root_weights)
    if self.rule_model is not None:
        self.rule_model.fit(self.rule_set)
    self.fit(graph.lexicon, graph.edge_set, root_weights, edge_weights)
def load_graph(filename, lexicon, threshold=0.0):
    edge_set = EdgeSet(lexicon)
    weights = []
    rules = {}
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            edge_freq = float(edge_freq_str)
            if edge_freq < threshold:
                continue
            if rule_str not in rules:
                rules[rule_str] = Rule.from_string(rule_str)
            edge = GraphEdge(lexicon[word_1], lexicon[word_2],
                             rules[rule_str], weight=edge_freq)
            edge_set.add(edge)
            weights.append(edge_freq)
        except ValueError:
            # skip rows whose frequency or rule string cannot be parsed
            pass
    return FullGraph(lexicon, edge_set), np.array(weights)
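def example_cluster_from_file(graph_file: str, threshold: float = 0.1) \
        -> List[List[LexiconEntry]]:
    # A minimal end-to-end sketch (illustrative, not part of the original
    # pipeline): load the wordlist, read a weighted graph file in the TSV
    # format expected by load_graph() above and cluster it with
    # chinese_whispers(). The function name and parameter values are
    # assumptions made for the example only.
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    graph, weights = load_graph(graph_file, lexicon, threshold=threshold)
    return chinese_whispers(graph, weights, threshold=threshold)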
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite and save it
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    logging.getLogger('main').info('Saving the model...')
    model.save()
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rule_set = RuleSet.load(shared.filenames['rules'])

    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(shared.filenames['graph'], lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)

    deleted_rules = set()
    for iter_num in range(shared.config['modsel'].getint('iterations')):
        sampler = MCMCGraphSampler(
            full_graph, model,
            shared.config['modsel'].getint('warmup_iterations'),
            shared.config['modsel'].getint('sampling_iterations'))
        sampler.add_stat('acc_rate', AcceptanceRateStatistic(sampler))
        sampler.add_stat('edge_freq', EdgeFrequencyStatistic(sampler))
        sampler.add_stat('exp_cost', ExpectedCostStatistic(sampler))
        sampler.run_sampling()

        # fit the model to the sampled edge frequencies
        edge_weights = sampler.stats['edge_freq'].value()
        root_weights = np.ones(len(full_graph.lexicon))
        for idx in range(edge_weights.shape[0]):
            root_id = \
                full_graph.lexicon.get_id(full_graph.edge_set[idx].target)
            root_weights[root_id] -= edge_weights[idx]
        model.fit(sampler.lexicon, sampler.edge_set, root_weights,
                  edge_weights)

        # compute the rule statistics
        freq, contrib = sampler.compute_rule_stats()

        # determine the rules to delete
        deleted_rules |= set(np.where(contrib < 0)[0])
        logging.getLogger('main').info(
            '{} rules deleted.'.format(len(deleted_rules)))

        # delete the edges with the selected rules from the graph
        edges_to_delete = []
        for edge in full_graph.edges_iter():
            if model.rule_set.get_id(edge.rule) in deleted_rules:
                edges_to_delete.append(edge)
        full_graph.remove_edges(edges_to_delete)
        # deleting the rules is not necessary -- instead, save the reduced
        # rule set at the end; fitting will be performed separately

    logging.getLogger('main').info('Saving the graph...')
    full_graph.edge_set.save(shared.filenames['graph-modsel'])

    # remove the deleted rules from the rule set and save it
    logging.getLogger('main').info('Saving the rule set...')
    new_rule_set = RuleSet()
    for i, rule in enumerate(rule_set):
        if i not in deleted_rules:
            new_rule_set.add(rule, rule_set.get_domsize(rule))
    new_rule_set.save(shared.filenames['rules-modsel'])
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # load the model
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # set up the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph, model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print the results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()
    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save the paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) +
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug(
        'Average path length: {}'.format(pathlen / len(lexicon)))

    # save the rule frequency model fits to a file
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule,
                                model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(int)
    queue = [(word, 0) for word in lexicon
             if sampler.branching.parent(word) is None]
    while queue:
        word, depth = queue.pop()
        counts_per_depth[depth] += 1
        queue.extend([(child, depth + 1)
                      for child in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for depth, count in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(depth, count))