Example #1
0
 def save(self, filename: str) -> None:
     with open_to_write(filename) as fp:
         for sym, prob in self.probs.items():
             line = (sym[0], sym[1], prob) \
                    if isinstance(sym, tuple) \
                    else (sym, prob)
             write_line(fp, line)
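
All of these examples write their output through the project's small file-helper layer (open_to_write, write_line, and the read_tsv_* readers used further below). Their actual implementation is not shown here; the following is only a minimal sketch of how open_to_write and write_line could behave, assuming plain tab-separated UTF-8 output.

import contextlib
from typing import Any, Iterable, TextIO


@contextlib.contextmanager
def open_to_write(filename: str):
    # Hypothetical sketch: open a UTF-8 text file for writing and yield it,
    # closing it when the with-block ends.
    fp = open(filename, 'w', encoding='utf-8')
    try:
        yield fp
    finally:
        fp.close()


def write_line(fp: TextIO, fields: Iterable[Any]) -> None:
    # Hypothetical sketch: write one tab-separated row per call.
    fp.write('\t'.join(str(field) for field in fields) + '\n')
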
Example #2
0
def contract_graph(graph_file: str) -> None:
    '''Remove the additional information that was added for filtering
       (cf. expand_graph).'''
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Contracting the graph...')
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            write_line(graph_tmp_fp, (w1, w2, rule))
    rename_file(graph_file + '.tmp', graph_file)
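
Examples #2 and #6 iterate over an existing TSV file with read_tsv_file. A minimal sketch of what such a reader might look like, assuming one tuple of column values per tab-separated line and optional per-column type conversion (the show_progressbar flag is accepted but ignored in this sketch):

def read_tsv_file(filename, types=None, show_progressbar=False):
    # Hypothetical sketch: yield one tuple of fields per tab-separated line,
    # optionally converting each field with the matching callable in `types`.
    with open(filename, encoding='utf-8') as fp:
        for line in fp:
            fields = line.rstrip('\n').split('\t')
            if types is not None:
                fields = [conv(field) for conv, field in zip(types, fields)]
            yield tuple(fields)
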
Example #3
0
 def save_wordpair_stats(self, filename):
     stats, stat_names = [], []
     for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
         if isinstance(stat, UnorderedWordPairStatistic):
             stat_names.append(stat_name)
             stats.append(stat)
     with open_to_write(filename) as fp:
         write_line(fp, ('word_1', 'word_2') + tuple(stat_names))
         for key in self.unordered_word_pair_index:
             write_line(fp, key +\
                            tuple([stat.value(key) for stat in stats]))
Example #4
0
def filter_min_rule_freq(graph_file: str) -> None:
    logging.getLogger('main').info('filter_min_rule_freq')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (rule,
             freq), wordpairs in read_tsv_file_by_key(graph_file, (3, 4),
                                                      show_progressbar=True):
            if len(wordpairs) >= min_rule_freq:
                for word_1, word_2 in wordpairs:
                    write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
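
filter_min_rule_freq, like several of the later filters, consumes the graph file through read_tsv_file_by_key, grouping rows on one or more key columns. A minimal sketch of the assumed behaviour: the file is expected to be sorted on the key columns, and each yielded item is the key together with the remaining columns of every row in that group.

import itertools


def read_tsv_file_by_key(filename, key=1, show_progressbar=False):
    # Hypothetical sketch: group consecutive tab-separated rows by the given
    # 1-based key column(s) and yield (key, list of remaining-column tuples).
    key_cols = (key,) if isinstance(key, int) else tuple(key)

    def key_of(row):
        values = tuple(row[i - 1] for i in key_cols)
        return values[0] if isinstance(key, int) else values

    def rest_of(row):
        return tuple(f for i, f in enumerate(row, 1) if i not in key_cols)

    with open(filename, encoding='utf-8') as fp:
        rows = (line.rstrip('\n').split('\t') for line in fp)
        for group_key, group in itertools.groupby(rows, key=key_of):
            yield group_key, [rest_of(row) for row in group]
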
Example #5
0
 def write_edge_tr_mat(self, filename):
     with open_to_write(filename) as fp:
         for e_id, edge in enumerate(self.full_graph.edge_set):
             tag_probs = []
             edge_tr_mat = self.edge_tr_mat[e_id]
             for (t1_id, t2_id), val in edge_tr_mat.todok().items():
                 tag_1 = self.tagset[t1_id]
                 tag_2 = self.tagset[t2_id]
                 tag_probs.append(
                     (''.join(tag_1), ''.join(tag_2), str(val)))
             write_line(fp, (str(edge), ' '.join([t1+':'+t2+':'+prob \
                                                  for t1, t2, prob in tag_probs])))
Example #6
0
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if word_1:
                try:
                    n1, n2 = lexicon[word_1], lexicon[word_2]
                    for rule in extract_all_rules(n1, n2):
                        write_line(fp, (str(n1), str(n2), str(rule)))
                except KeyError:
                    if word_1 not in lexicon:
                        logging.getLogger('main').warning('%s not in lexicon' %
                                                          word_1)
                    if word_2 not in lexicon:
                        logging.getLogger('main').warning('%s not in lexicon' %
                                                          word_2)
Example #7
0
 def save_edge_stats(self, filename):
     stats, stat_names = [], []
     for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
         if isinstance(stat, EdgeStatistic):
             stat_names.append(stat_name)
             stats.append(stat)
     with open_to_write(filename) as fp:
         write_line(fp, ('word_1', 'word_2', 'rule') + tuple(stat_names))
         for idx, edge in enumerate(self.edge_set):
             write_line(fp,
                        (str(edge.source), str(edge.target),
                         str(edge.rule)) + tuple([stat.val[idx]\
                                                  for stat in stats]))
Example #8
0
 def save_iter_stats(self, filename: str) -> None:
     stats, stat_names = [], []
     for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
         if isinstance(stat, IterationStatistic):
             stat_names.append(stat_name)
             stats.append(stat)
     with open_to_write(filename) as fp:
         write_line(fp, ('iter_num', ) + tuple(stat_names))
         for iter_num in range(self.iter_stat_interval,
                               self.sampling_iter + 1,
                               self.iter_stat_interval):
             write_line(fp, (str(iter_num),) + \
                            tuple([stat.value(iter_num) for stat in stats]))
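
Examples #3, #7 and #8 repeat the same pattern: pick out the statistics of one type, write a header row, then one row per key. A hypothetical generalisation of that pattern, using the same open_to_write/write_line helpers, might look like this (names and the `keys`/`value` convention are illustrative, not part of the project):

from operator import itemgetter


def save_stats_of_type(stats_dict, stat_type, filename, key_header, keys, value):
    # Hypothetical generalisation of Examples #3, #7 and #8: `keys` yields
    # (printable key fields, lookup key) pairs and `value(stat, lookup_key)`
    # extracts a single statistic value.
    stat_names, stats = [], []
    for stat_name, stat in sorted(stats_dict.items(), key=itemgetter(0)):
        if isinstance(stat, stat_type):
            stat_names.append(stat_name)
            stats.append(stat)
    with open_to_write(filename) as fp:
        write_line(fp, tuple(key_header) + tuple(stat_names))
        for key_fields, lookup_key in keys:
            write_line(fp, tuple(key_fields) +
                       tuple(value(stat, lookup_key) for stat in stats))

Example #8, for instance, would correspond to stat_type=IterationStatistic, key_header=('iter_num',) and value=lambda stat, i: stat.value(i).
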
Example #9
0
def filter_max_edges_per_wordpair(graph_file: str) -> None:
    logging.getLogger('main').info('filter_max_edges_per_wordpair')
    sort_file(graph_file, stable=True, key=(1, 2))
    max_edges_per_wordpair = \
        shared.config['preprocess'].getint('max_edges_per_wordpair')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (word_1,
             word_2), edges in read_tsv_file_by_key(graph_file, (1, 2),
                                                    show_progressbar=True):
            for rule, freq in edges[:max_edges_per_wordpair]:
                write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    sort_file(graph_file, key=3)
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    update_file_size(graph_file)
Example #10
0
def expand_graph(graph_file: str) -> None:
    '''Annotate graph with additional information needed for filtering:
       currently rule frequencies.'''
    min_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Expanding the graph for filtering...')
        for rule, wordpairs in read_tsv_file_by_key(graph_file,
                                                    3,
                                                    show_progressbar=True):
            freq = len(wordpairs)
            if freq >= min_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_tmp_fp, (w1, w2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
Example #11
0
def filter_max_num_rules(graph_file: str) -> None:
    logging.getLogger('main').info('filter_max_num_rules')
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    max_num_rules = shared.config['preprocess'].getint('max_num_rules')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    progressbar = tqdm.tqdm(total=max_num_rules)
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        num_rules = 0
        for key, wordpairs in read_tsv_file_by_key(graph_file, (3, 4)):
            rule, freq = key
            num_rules += 1
            progressbar.update()
            if int(freq) >= min_rule_freq:
                for wordpair in wordpairs:
                    w1, w2 = wordpair
                    write_line(graph_fil_fp, (w1, w2, rule, freq))
            if num_rules >= max_num_rules:
                break
    progressbar.close()
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
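
Taken together, Examples #2, #4, #9, #10 and #11 look like the individual stages of one graph-filtering pipeline: annotate the graph with rule frequencies, apply the frequency and per-wordpair limits, and finally strip the extra column again. A plausible, purely illustrative driver (the actual ordering in the project may differ):

def filter_graph(graph_file: str) -> None:
    # Hypothetical ordering of the filtering stages shown above.
    expand_graph(graph_file)                   # Example #10: add rule frequencies
    filter_max_num_rules(graph_file)           # Example #11
    filter_min_rule_freq(graph_file)           # Example #4
    filter_max_edges_per_wordpair(graph_file)  # Example #9
    contract_graph(graph_file)                 # Example #2: drop the frequency column
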
Example #12
0
 def save(self, filename: str) -> None:
     with open_to_write(filename) as fp:
         for sym, prob in self.probs.items():
             write_line(fp, (sym, prob))
Example #13
0
 def write_leaf_prob(self, filename):
     with open_to_write(filename) as fp:
         for w_id, entry in enumerate(self.lexicon):
             tag_probs = [''.join(tag)+':'+str(self.leaf_prob[w_id,t_id]) \
                          for t_id, tag in enumerate(self.tagset)]
             write_line(fp, (str(entry), ' '.join(tag_probs)))
Example #14
0
 def save_edge_costs(self, filename):
     with open_to_write(filename) as fp:
         for i, edge in enumerate(self.edge_set):
             write_line(fp, (edge, self.edge_cost_cache[i]))
Example #15
0
 def save_root_costs(self, filename):
     with open_to_write(filename) as fp:
         for i, entry in enumerate(self.lexicon):
             write_line(fp, (entry, self.root_cost_cache[i]))
Example #16
0
 def save(self, filename: str) -> None:
     with open_to_write(filename) as fp:
         for edge in self:
             write_line(fp, edge.to_tuple()[:3])
Example #17
0
 def save(self, filename: str) -> None:
     with open_to_write(filename) as fp:
         write_line(fp, ('', self.smoothing_prob))
         for tag, prob in self.probs.items():
             write_line(fp, (''.join(tag), prob))
Example #18
0
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph,
        model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()

    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) + \
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug('Average path length: {}'\
                                    .format(pathlen / len(lexicon)))

    # save rule frequency model fits to a file
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule, model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(lambda: 0)
    queue = [(word, 0) for word in lexicon \
                       if sampler.branching.parent(word) is None]
    while queue:
        (word, d) = queue.pop()
        counts_per_depth[d] += 1
        queue.extend([(child, d + 1)
                      for child in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))
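
run() pulls a number of options from shared.config; the snippet below lists just the keys referenced in this function, with purely illustrative values (the project's real sections and defaults may differ):

import configparser

# Hypothetical, illustrative values for the configuration keys read by run().
config = configparser.ConfigParser()
config.read_dict({
    'sample': {
        'warmup_iterations': '1000',
        'sampling_iterations': '10000',
        'iter_stat_interval': '100',
        'stat_cost': 'yes',
        'stat_acc_rate': 'yes',
        'stat_iter_cost': 'no',
        'stat_edge_freq': 'yes',
        'stat_undirected_edge_freq': 'no',
        'stat_rule_freq': 'yes',
        'stat_rule_contrib': 'no',
    },
    'Models': {
        'depth_cost': '0.0',
        'edge_frequency_model': 'lognormal',
    },
})
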
Example #19
0
 def save_rule_stats(self, filename):
     freq, contrib = self.compute_rule_stats()
     with open_to_write(filename) as fp:
         for r_id, rule in enumerate(self.model.rule_set):
             write_line(fp, (rule, freq[r_id], contrib[r_id]))