Example #1
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()
    # full_graph.load_edges_from_file(graph_file)

    # count rule frequencies in the full graph
    # rule_freq = defaultdict(lambda: 0)
    # for edge in full_graph.iter_edges():
    #     rule_freq[edge.rule] += 1

    # initialize the model
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    # model = PointModel()
    # model.fit_rootdist(lexicon.entries())
    # model.fit_ruledist(rule for (rule, domsize) in rules)
    # for rule, domsize in rules:
    #     model.add_rule(rule, domsize, freq=rule_freq[rule])

    softem(full_graph, model)
Example #2
def chinese_whispers(graph: FullGraph,
                     weights: np.ndarray,
                     threshold: float = 0,
                     root_weights: bool = False,
                     max_iterations: Optional[int] = None) -> List[List[LexiconEntry]]:
    def _weight(edge):
        return weights[graph.edge_set.get_id(edge)]

    # initialize
    node_cluster = np.empty(graph.number_of_nodes())
    for node in graph:
        i = graph.lexicon.get_id(node)
        node_cluster[i] = i
    # run the clustering
    changed = 0
    iter_num = 0
    nodes_list = list(graph.nodes_iter())
    while True:
        iter_num += 1
        if max_iterations is not None and max_iterations > 0 and \
                iter_num > max_iterations:
            break
        changed = 0
        logging.getLogger('main').info('Iteration {}'.format(iter_num))
        random.shuffle(nodes_list)
        for node in tqdm.tqdm(nodes_list):
            n_id = graph.lexicon.get_id(node)
            cluster_scores = {node_cluster[n_id]: 0.0}
            for edge in graph.ingoing_edges(node):
                if _weight(edge) > threshold:
                    src_id = graph.lexicon.get_id(edge.source)
                    if node_cluster[src_id] not in cluster_scores:
                        cluster_scores[node_cluster[src_id]] = 0
                    cluster_scores[node_cluster[src_id]] += _weight(edge)
            if root_weights:
                root_weight = 1 - sum(cluster_scores.values())
                cluster_scores[node_cluster[n_id]] += root_weight
            cluster_id = max(cluster_scores.items(), key=itemgetter(1))[0]
            if cluster_id != node_cluster[n_id]:
                node_cluster[n_id] = cluster_id
                changed += 1
        logging.getLogger('main').info('changed nodes: {}'.format(changed))
        if changed == 0:
            break
    # retrieve the clusters
    clusters_hash = {}
    for i, cl_id in enumerate(node_cluster):
        if cl_id not in clusters_hash:
            clusters_hash[cl_id] = []
        clusters_hash[cl_id].append(graph.lexicon[i])
    return sorted(list(clusters_hash.values()))
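
A minimal usage sketch (not part of the original example): it assumes the load_graph helper shown in Example #5 below, and the threshold and iteration values are illustrative. The point it shows is that weights must be a numpy array indexed by edge-set id, which is how _weight() looks up edge weights.

# usage sketch -- threshold/max_iterations values are illustrative assumptions
lexicon = Lexicon.load(shared.filenames['wordlist'])
graph, weights = load_graph(shared.filenames['graph'], lexicon)  # see Example #5
clusters = chinese_whispers(graph, weights, threshold=0.5, max_iterations=20)
for cluster in clusters:
    print(', '.join(str(entry) for entry in cluster))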
Example #3
def __init__(self,
             full_graph: FullGraph,
             model: ModelSuite,
             tagset: List[Tuple[str]],
             warmup_iter: int = 1000,
             sampling_iter: int = 100000,
             iter_stat_interval: int = 1,
             min_subtree_prob: float = 1e-100):
    self.tagset = tagset
    logging.getLogger('main').debug('tagset = {}'.format(str(tagset)))
    self.tag_idx = {tag: i for i, tag in enumerate(tagset)}
    self.min_subtree_prob = min_subtree_prob
    untagged_edge_set, self.edge_tr_mat = \
        self._compute_untagged_edges_and_transition_mat(full_graph, model)
    untagged_full_graph = FullGraph(full_graph.lexicon, untagged_edge_set)
    super().__init__(untagged_full_graph,
                     model,
                     warmup_iter=warmup_iter,
                     sampling_iter=sampling_iter,
                     iter_stat_interval=iter_stat_interval)
    self._compute_root_prob()
    self._fast_compute_leaf_prob()
    self.init_forward_prob()
    self.init_backward_prob()
    self.write_debug_info()
Example #4
def initialize(self, graph: FullGraph) -> None:
    '''Fit the models assuming unit weights for all roots and edges'''
    if shared.config['General'].getboolean('supervised'):
        # supervised learning -- take only nodes with no incoming edges
        # as roots
        root_weights = np.array([1 if not graph.predecessors(node) else 0
                                 for node in graph.lexicon])
    else:
        root_weights = np.ones(len(graph.lexicon))
    edge_weights = np.ones(len(graph.edge_set))
    self.root_model.fit(graph.lexicon, root_weights)
    if self.rule_model is not None:
        self.rule_model.fit(self.rule_set)
    self.fit(graph.lexicon, graph.edge_set, root_weights, edge_weights)
Example #5
def load_graph(filename, lexicon, threshold=0.0):
    edge_set = EdgeSet(lexicon)
    weights = []
    rules = {}
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            edge_freq = float(edge_freq_str)
            if edge_freq < threshold:
                continue
            if rule_str not in rules:
                rules[rule_str] = Rule.from_string(rule_str)
            edge = GraphEdge(lexicon[word_1],
                             lexicon[word_2],
                             rules[rule_str],
                             weight=edge_freq)
            edge_set.add(edge)
            weights.append(edge_freq)
        except ValueError:
            # skip rows whose frequency or rule string cannot be parsed
            pass
    return FullGraph(lexicon, edge_set), np.array(weights)
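
The TSV layout that load_graph expects can be read off the tuple unpacking above: one edge per line with four tab-separated columns. A short sketch, with a hypothetical file name and an illustrative threshold:

# columns (tab-separated): word_1  word_2  rule  edge_frequency
# rows that fail to parse are skipped by the except ValueError branch above
lexicon = Lexicon.load(shared.filenames['wordlist'])
graph, weights = load_graph('graph-sample.tsv', lexicon, threshold=0.01)  # hypothetical file name
print('{} edges kept'.format(len(graph.edge_set)))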
Example #6
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite and save it
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    logging.getLogger('main').info('Saving the model...')
    model.save()
Example #7
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rule_set = RuleSet.load(shared.filenames['rules'])

    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(shared.filenames['graph'], lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    deleted_rules = set()

    for iter_num in range(shared.config['modsel'].getint('iterations')):
        sampler = MCMCGraphSampler(
            full_graph, model,
            shared.config['modsel'].getint('warmup_iterations'),
            shared.config['modsel'].getint('sampling_iterations'))
        sampler.add_stat('acc_rate', AcceptanceRateStatistic(sampler))
        sampler.add_stat('edge_freq', EdgeFrequencyStatistic(sampler))
        sampler.add_stat('exp_cost', ExpectedCostStatistic(sampler))
        sampler.run_sampling()

        # fit the model
        edge_weights = sampler.stats['edge_freq'].value()
        root_weights = np.ones(len(full_graph.lexicon))
        for idx in range(edge_weights.shape[0]):
            root_id = \
                full_graph.lexicon.get_id(full_graph.edge_set[idx].target)
            root_weights[root_id] -= edge_weights[idx]
        model.fit(sampler.lexicon, sampler.edge_set, root_weights,
                  edge_weights)

        # compute the rule statistics
        freq, contrib = sampler.compute_rule_stats()

        # determine the rules to delete
        deleted_rules |= set(np.where(contrib < 0)[0])
        logging.getLogger('main').info(\
            '{} rules deleted.'.format(len(deleted_rules)))

        # delete the edges with selected rules from the graph
        edges_to_delete = []
        for edge in full_graph.edges_iter():
            if model.rule_set.get_id(edge.rule) in deleted_rules:
                edges_to_delete.append(edge)
        full_graph.remove_edges(edges_to_delete)

        # deleting the rules is not necessary -- instead, save the reduced
        # rule set at the end; fitting will be performed separately

    logging.getLogger('main').info('Saving the graph...')
    full_graph.edge_set.save(shared.filenames['graph-modsel'])

    # remove the deleted rules from the rule set and save it
    logging.getLogger('main').info('Saving the rule set...')
    new_rule_set = RuleSet()
    for i, rule in enumerate(rule_set):
        if i not in deleted_rules:
            new_rule_set.add(rule, rule_set.get_domsize(rule))
    new_rule_set.save(shared.filenames['rules-modsel'])
Example #8
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph,
        model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()

    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) + \
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug('Average path length: {}'\
                                    .format(pathlen / len(lexicon)))

    # save rule frequency model fits to a file
    if model.edge_frequency_model == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule, model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(lambda: 0)
    queue = [(word, 0) for word in lexicon \
                       if sampler.branching.parent(word) is None]
    while queue:
        (word, d) = queue.pop()
        counts_per_depth[d] += 1
        queue.extend([(word, d+1) \
                      for word in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))