Example #1
def load(filename: str, lexicon: Lexicon, rule_set: RuleSet) -> 'EdgeSet':
    # build GraphEdge objects lazily from the (source, target, rule)
    # triples read from the TSV file
    edge_iter = (GraphEdge(lexicon[source],
                           lexicon[target],
                           rule_set[rule])
                 for source, target, rule in read_tsv_file(filename))
    return EdgeSet(lexicon, edge_iter)
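
All of the snippets on this page read their input through the same read_tsv_file helper, whose implementation is not shown here. The sketch below is a minimal reconstruction of the behaviour the examples appear to assume; the parameter names (types, show_progressbar) come from the calls on this page, but the signature order and the body are assumptions, not the project's actual code. It simply iterates over the rows of a tab-separated file and, when a tuple of types is given, converts the columns accordingly.

from typing import Any, Iterator, List, Optional, Sequence

def read_tsv_file(filename: str,
                  types: Optional[Sequence[type]] = None,
                  show_progressbar: bool = False) -> Iterator[List[Any]]:
    # Hypothetical re-implementation for illustration only; the progress
    # bar option is accepted but ignored in this sketch.
    with open(filename, encoding='utf-8') as fp:
        for line in fp:
            row = line.rstrip('\n').split('\t')
            if types is not None:
                # e.g. types=(str, float) converts (rule, prob) rows
                row = [t(value) for t, value in zip(types, row)]
            yield row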
Example #2
def load(filename: str, rule_set: RuleSet) -> 'SimpleEdgeModel':
    result = SimpleEdgeModel(rule_set)
    # one probability per rule, read as (rule, probability) pairs
    probs = np.zeros(len(rule_set))
    for rule, prob in read_tsv_file(filename, (str, float)):
        r_id = rule_set.get_id(rule_set[rule])
        probs[r_id] = prob
    result.set_probs(probs)
    return result
Example #3
def contract_graph(graph_file: str) -> None:
    '''Remove any additional information that was only needed for filtering
    (here: the edge frequency column).'''
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Contracting the graph...')
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            write_line(graph_tmp_fp, (w1, w2, rule))  # drop the freq column
    rename_file(graph_file + '.tmp', graph_file)
Example #4
    def load(filename: str) -> 'Lexicon':

        def _parse_entry_from_row(row: List[str], use_restr=False,
                                  use_freq=False, use_vec=False, vec_sep=' ',
                                  vec_dim=None) -> LexiconEntry:
            my_row = list(row)  # copy because it will be destroyed
            word = my_row.pop(0)
            kwargs = {}
            if use_restr:
                restr = my_row.pop(0).strip()
                kwargs['is_possible_edge_source'] = 'L' in restr
                kwargs['is_possible_edge_target'] = 'R' in restr
            if use_freq:
                kwargs['freq'] = int(my_row.pop(0).strip())
            if use_vec:
                vec_str = my_row.pop(0).strip()
                kwargs['vec'] = \
                    np.array(list(map(float, vec_str.split(vec_sep))))
                if kwargs['vec'].shape[0] != vec_dim:
                    raise Exception("%s dim=%d" % \
                                    (word, kwargs['vec'].shape[0]))
            return LexiconEntry(word, **kwargs)

        lexicon = Lexicon()
        # determine the file format
        use_restr = \
            shared.config['General'].getboolean('use_edge_restrictions')
        use_freq = \
            shared.config['Models'].get('root_frequency_model') != 'none' or \
            shared.config['Models'].get('edge_frequency_model') != 'none'
        use_vec = \
            shared.config['Models'].get('root_feature_model') != 'none' or \
            shared.config['Models'].get('edge_feature_model') != 'none'
        supervised = shared.config['General'].getboolean('supervised')
        vec_sep = shared.format['vector_sep']
        vec_dim = shared.config['Features'].getint('word_vec_dim')
        kwargs = {
            'use_restr': use_restr,
            'use_freq': use_freq,
            'use_vec': use_vec,
            'vec_dim': vec_dim
        }
        items_to_add = []
        for row in read_tsv_file(filename):
            try:
                if supervised:
                    row.pop(0)  # the first item is the base/lemma -> ignore
                entry = _parse_entry_from_row(row, **kwargs)
                items_to_add.append(entry)
            except Exception as e:
                logging.getLogger('main').warning('ignoring %s: %s' %\
                                                  (row[0], str(e)))
        lexicon.add(items_to_add)
        return lexicon
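
For orientation, the row layout that _parse_entry_from_row expects can be reconstructed from the pops above: each tab-separated row starts with the word, optionally followed by an edge-restriction flag containing 'L' and/or 'R', an integer frequency, and a vector of vec_dim floats joined by vec_sep; in the supervised setting an extra base/lemma column precedes all of these and is ignored. The concrete values below are made up for illustration; with every optional column enabled and vec_dim = 3, a row (columns separated by tab characters) might look like:

walked    LR    42    0.12 -0.03 0.57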
Example #5
def load_raw_vocabulary(filename: str) -> Lexicon:
    lexicon = Lexicon()
    for (word, ) in read_tsv_file(filename):
        try:
            lexicon.add(LexiconEntry(word))
        except Exception as e:
            logging.getLogger('main').warning('ignoring %s: %s' %\
                                              (word, str(e)))
    return lexicon
Example #6
def load(filename: str) -> 'SimpleTagModel':
    result = SimpleTagModel()
    for tag_str, prob in read_tsv_file(filename, (str, float)):
        if tag_str:
            tag = tuple(shared.compiled_patterns['tag'].findall(tag_str))
            result.probs[tag] = prob
        else:
            # an empty tag string holds the smoothing probability
            result.smoothing_prob = prob
    return result
Example #7
def load(filename: str) -> 'UnigramRuleModel':
    result = UnigramRuleModel()
    result.probs = {}
    for row in read_tsv_file(filename):
        if len(row) == 2:
            result.probs[row[0]] = float(row[1])
        elif len(row) == 3:
            result.probs[(row[0], row[1])] = float(row[2])
        else:
            logging.getLogger('main').warning(
                'Cannot parse row: {} in {}'.format(str(row), filename))
    return result
Example #8
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if word_1:
                try:
                    n1, n2 = lexicon[word_1], lexicon[word_2]
                    for rule in extract_all_rules(n1, n2):
                        write_line(fp, (str(n1), str(n2), str(rule)))
                except KeyError:
                    if word_1 not in lexicon:
                        logging.getLogger('main').warning('%s not in lexicon' %
                                                          word_1)
Example #9
def load_rules() -> List[Tuple[Rule, float]]:
    rules_filename = None
    if shared.config['compile'].getboolean('weighted'):
        if shared.config['Models'].get('edge_model') == 'simple':
            rules_filename = shared.filenames['edge-model']
            max_cost = None \
                       if shared.config['compile'].get('max_cost') == 'none' \
                       else shared.config['compile'].getfloat('max_cost')
            rules = [(Rule.from_string(rule), -math.log(prod))\
                     for rule, prod in\
                         read_tsv_file(rules_filename, (str, float))\
                     if max_cost is None or -math.log(prod) < max_cost ] +\
                    [(Rule.from_string(':/:___:'), 0.0)]
            return rules
        else:
            raise Exception('Compiling a weighted analyzer is only possible'
                            ' for the Bernoulli edge model.')
    else:
        rules_filename = shared.filenames['rules-modsel']
        if not file_exists(rules_filename):
            rules_filename = shared.filenames['rules']
        return [(Rule.from_string(rule), 0.0)\
                for (rule,) in read_tsv_file(rules_filename, (str,))] +\
               [(Rule.from_string(':/:___:'), 0.0)]
Example #10
def load_graph(filename, lexicon, threshold=0.0):
    edge_set = EdgeSet(lexicon)
    weights = []
    rules = {}
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            edge_freq = float(edge_freq_str)
            if edge_freq < threshold:
                continue
            if rule_str not in rules:
                rules[rule_str] = Rule.from_string(rule_str)
            edge = GraphEdge(lexicon[word_1],
                             lexicon[word_2],
                             rules[rule_str],
                             weight=edge_freq)
            edge_set.add(edge)
            weights.append(edge_freq)
        except ValueError:
            pass
    return FullGraph(lexicon, edge_set), np.array(weights)
Example #11
def run():
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    lexicon_tr = FST.load_transducer(shared.filenames['lexicon-tr'])
    rules_tr = FST.load_transducer(shared.filenames['rules-tr'])
    rules_tr.convert(hfst.ImplementationType.HFST_OLW_TYPE)
    alphabet = lexicon_tr.get_alphabet()
    model = ModelSuite.load()
    max_results = shared.config['inflect'].getint('max_results')

    if shared.options['interactive']:
        for line in sys.stdin:
            try:
                lemma_str, tag = line.rstrip().split()
                lemma = LexiconEntry(lemma_str)
                for analysis in inflect_word(lemma,
                                             tag,
                                             rules_tr,
                                             model,
                                             max_results=max_results):
                    print(*analysis, sep='\t')
            except Exception as e:
                logging.getLogger('main').warning(e)
    else:
        pairs = []
        # FIXME is there a better solution for creating lists of LexiconEntry
        # objects and skipping the ones for which exceptions are thrown?
        for lemma, tag in read_tsv_file(shared.filenames['analyze.wordlist']):
            try:
                pairs.append((LexiconEntry(lemma), tag))
            except Exception as e:
                logging.warning(e)
        for lemma, tag in tqdm.tqdm(pairs):
            for analysis in inflect_word(lemma,
                                         tag,
                                         rules_tr,
                                         model,
                                         max_results=max_results):
                print(*analysis, sep='\t')
Example #12
def load(filename: str) -> 'NGramFeatureExtractor':
    result = NGramFeatureExtractor()
    result.ngrams = [ngram for (ngram, ) in read_tsv_file(filename)]
    result.feature_idx = \
        { ngram: i for i, ngram in enumerate(result.ngrams) }
    return result
Example #13
def load(filename: str) -> 'UnigramRootModel':
    result = UnigramRootModel()
    for sym, prob in read_tsv_file(filename, types=(str, float)):
        result.probs[sym] = prob
    return result
Example #14
def root_reader():
    col = 0
    for row in read_tsv_file(shared.filenames['wordlist']):
        if col < len(row) and row[col]:
            yield row[col]
Example #15
def load_normalized_wordlist(filename: str) -> Iterable[str]:
    results = []
    for (word, ) in read_tsv_file(filename):
        results.append(LexiconEntry(word).normalized)
    return results