Code example #1
def learn_lexical_entries(input_dir, cat_path, dict_path, verbose='none', \
        parse_mode='given'):
    from IPython.display import display
    from src.utl.turtle import html_table
    from src.utl.read_files import check_dir, check_dir_files, check_corpus
    from src.utl.write_files import list2file
    from src.link_grammar.turtle import \
        files2disjuncts, lexical_entries, entries2clusters, entries2categories, \
        disjuncts2clusters, entries2rules, save_link_grammar

    if check_dir(input_dir, create=False, verbose=verbose):
        files = check_dir_files(input_dir, verbose=verbose)
        if len(files) > 0:
            if verbose == 'max': print(files)
            for i, file in enumerate(files):
                if check_corpus(file, verbose=verbose):
                    if verbose == 'max':
                        print('File #' + str(i), file, 'checked')
                else:
                    print('File #' + str(i), file, 'check failed')
        else:
            print('Input directory', input_dir, 'is empty')
            return {'error': 'empty input directory'}  # 'files' would be undefined below
    else:
        print('No input directory', input_dir)
        return {'error': 'no input directory'}  # 'files' would be undefined below

    log = {'project': 'Grammar Learner -- Lexical entries'}  # use OR DEL?

    disjuncts = files2disjuncts(files, 'LEFT-WALL', True, verbose)
    #TODO: parse_mode?
    entries = lexical_entries(disjuncts)
    category_list = entries2categories(entries)
    if verbose == 'max':
        display(
            html_table(
                [['Parent', 'Category', 'Quality', 'Words', 'Relevance']] +
                category_list))
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
        print('<...>\nTotal', len(categories.splitlines()), 'lines, saved to',
              cat_file)

    lg_rule_list = entries2rules(disjuncts2clusters(entries2clusters(entries)))
    if verbose == 'max':
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict
    #TODO: return paths to categories and dict?
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
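
A minimal usage sketch, with hypothetical paths: learn_lexical_entries expects a directory of MST-parse .txt files and returns the locations of the files it saves.

# Usage sketch - all paths below are hypothetical examples.
response = learn_lexical_entries(
    input_dir='data/POC-Turtle/mst_parses/',
    cat_path='output/categories/',
    dict_path='output/dict/',
    verbose='min')
print(response['categories_file'])  # e.g. output/categories/categories.txt
print(response['grammar_file'])     # path parsed from the save_link_grammar report
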
Code example #2
def save_cat_tree(cats, output_categories, verbose='none'):  #80706 0.5
    #80611 ~ cats2list without 'djs', children'...
    # cats: {'cluster':[], 'words':[], ...}                     #80609
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC

    tree_file = output_categories
    if '.' not in tree_file:  #auto file name
        if tree_file[-1] != '/': tree_file += '/'
        #-tree_file += (str(len(set([x[0] for x in cats_list]))) + '_cat_tree.txt')
        n_cats = len(
            [x for i, x in enumerate(cats['parent']) if i > 0 and x < 1])
        tree_file += (str(n_cats) + '_cat_tree.txt')

    categories = []
    for i, cluster in enumerate(cats['cluster']):
        if i == 0: continue
        category = []
        if cats['cluster'][i] is not None:
            category.append(cats['cluster'][i])
        else:
            category.append('')
        category.append(cats['parent'][i])
        category.append(i)
        category.append(round(cats['quality'][i], 2))
        #!category.append(sorted(cats['words'][i]))  #80704+06 tmp hack FIXME
        wordz = deepcopy(sorted(cats['words'][i]))
        #-80704 word@1, word@2 ⇒ word.a, word.b:
        #-wordz = [x.replace('@1','.a') for x in wordz]
        #-wordz = [x.replace('@2','.b') for x in wordz]
        #-wordz = [x.replace('@3','.c') for x in wordz]
        wordz = [x.replace('@', '.')
                 for x in wordz]  #80706 WSD: word@1 ⇒ word.1
        category.append(wordz)  #80704+06 tmp hack FIXME
        #80704+06 end
        category.append(cats['similarities'][i])
        #-category.append(cats['children'][i])
        categories.append(category)

    string = list2file(categories, tree_file)

    if verbose in ['max', 'debug']:
        print(UTC(), ':: src/utl/write_files.py save_cat_tree:',
              len(cats['cluster']) - 1, 'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        display(
            html_table(
                [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] +
                categories))

    return {'cat_tree_file': tree_file}
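
For reference, a minimal sketch of the column-oriented cats dictionary that save_cat_tree consumes; row 0 is a placeholder the loop skips, and all values here are invented for illustration.

# Hypothetical input illustrating the expected structure.
cats = {
    'cluster':      [None, 'C01', 'C02'],        # row 0 is skipped (i == 0)
    'parent':       [0, 0, 0],                   # 0 = top-level category
    'quality':      [0.0, 0.87, 0.91],
    'words':        [[], ['cat@1', 'dog'], ['runs', 'sits']],
    'similarities': [[], [1.0], [1.0]],
}
response = save_cat_tree(cats, 'output/categories/', verbose='none')
# auto file name: 2 top-level categories ⇒ output/categories/2_cat_tree.txt
print(response['cat_tree_file'])
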
Code example #3
def save_category_tree(category_list, tree_file, verbose='none'):  #80522
    import os
    cats = category_list
    clusters = {}
    m = 0
    for i, x in enumerate(cats):
        if x[0] not in clusters: clusters[x[0]] = []
        clusters[x[0]].append(i)
        if x[2] > m: m = x[2]
    tree = []
    for k, v in clusters.items():
        if len(v) == 1:
            tree.append(cats[v[0]])
        elif len(v) > 1:
            words = []
            similarities = []
            for j in v:
                words.extend(cats[j][4])
                similarities.extend(cats[j][5])
            tree.append(
                [cats[v[0]][0], 0, m + 1, cats[v[0]][3], words, similarities])
            for j in v:
                tree.append([
                    '', m + 1, cats[j][2], cats[j][3], cats[j][4], cats[j][5]
                ])
        else:
            print('Unexpected cluster grouping:', k, v)  # defensive: should not happen
    if verbose in ['max', 'debug']:
        from src.utl.widgets import html_table
        from IPython.display import display
        display(
            html_table(
                [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] +
                tree))

    from src.utl.write_files import list2file
    #-tree_file = os.path.dirname(cat_file) + '/cat_tree.txt'
    string = list2file(tree, tree_file)

    return {'tree_file': tree_file}
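
A usage sketch, assuming each category_list row follows the [code, parent, id, quality, words, similarities] layout implied by the indexing above; rows sharing a code are merged under a new parent node.

# Hypothetical category list.
category_list = [
    ['C01', 0, 1, 0.9, ['dog'],  [1.0]],
    ['C01', 0, 2, 0.8, ['cat'],  [0.9]],  # same code as the row above ⇒ grouped
    ['C02', 0, 3, 0.7, ['runs'], [1.0]],
]
response = save_category_tree(category_list, 'output/cat_tree.txt')
print(response['tree_file'])
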
Code example #4
def category_learner(links, \
    cat_path, dict_path, tmpath = '', verbose = 'none', \
    parse_mode = 'given', left_wall = '', period = False, \
    context = 1, window = 'mst', weighting = 'ppmi', group = True, \
    word_space = 'vectors', dim_max = 100, sv_min = 0.1,
    dim_reduction = 'svm', \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9): #, \
    #-generalization = 'off', merge = 0.8, aggregate = 0.2, grammar_rules = 1):

    from IPython.display import display
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc03 import number_of_clusters, clusters2list  #80422
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Grammar Learner v.0.3 2018-04-11', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''

    if tmpath == '': tmpath = dict_path  # temporary files path
    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if word_space == 'vectors':
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        vdf, sv, res3 = pmisvd(links, tmpath, tmpath, dim)
        log.update(res3)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if clustering == 'kmeans':
        # NOTE: this branch needs vdf built by the word_space == 'vectors' branch above
        #^from src.clustering.kmeans import cluster_words_kmeans
        #^from src.clustering.poc03 import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering,
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc03/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']: print(clusters.applymap(round2))
        if verbose in ['max', 'debug']:  #80412 hack: plots for AGI-2018 :(
            if context == 1:  #FIXME:DEL?
                plot2d(1, 2, clusters, 'cluster_words', 10)
            else:
                if len(clusters) < 6:
                    plot2d(1, 3, clusters, 'cluster_words', 10)
                else:
                    plot2d(1, 4, clusters, 'cluster_words', 10)

    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']: print('clustering:', clustering)
        # NOTE: group_links is not defined in this version; see the inline
        # definition in Code example #6 (src.clustering.grouping per the TODO)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week

    # Save categories

    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''

    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + str(len(clusters)) + '_categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''

    if verbose in ['debug']:
        print('\nWord categories:\n')
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose not in ['min', 'none']:
        print('\nCategory list -', len(categories.splitlines()),
              'lines, saved to', cat_file)

    return category_list, log
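
A usage sketch for the k-means path, assuming links is the pandas DataFrame produced upstream by files2links (word, link, count columns); paths are hypothetical.

category_list, log = category_learner(
    links, 'output/categories/', 'output/dict/',
    tmpath='/tmp/grammar_learner/',
    word_space='vectors', clustering='kmeans',
    cluster_range=(2, 48, 1), cluster_criteria='silhouette',
    verbose='min')
print(log['vector_space_dim'], '-', len(category_list), 'categories')
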
Code example #5
def learn_connectors(input_dir, cat_path, dict_path, verbose='none', \
    parse_mode='given', \
    word_space = 'hyperwords', dim_max = 100, sv_min = 0.1, \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = ''):

    from IPython.display import display
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.utl.read_files import check_mst_files
    from src.space.poc import files2links
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Unified Grammar Learner: Clustering words', \
           'date': str(UTC()), 'project_dir': dict_path, 'input_dir': input_dir }
    """TODO: dict ⇒ list [[]] / OrderedDict?"""

    files, response = check_mst_files(input_dir, verbose='none')
    links = files2links(files, parse_mode='given', context=1, group = True, \
                        left_wall='LEFT-WALL', period=True, verbose='none')

    # vector_space_dim(links, path, tmpath, dim_max=100, sv_min=0.9, 'max')
    if tmpath == '': tmpath = dict_path
    dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
    log.update({'vector_space_dim': dim})
    if verbose not in ['none', 'min']:
        print('Optimal vector space dimensionality:', dim)

    vdf, sv, res2 = pmisvd(links, dict_path, tmpath, dim)
    log.update(res2)
    #-vdf.applymap(round2).sort_values(by=[1,2,3], ascending=[False,False,False])

    n_clusters = number_of_clusters(vdf, cluster_range, clustering,
        criteria=cluster_criteria, level=cluster_level, verbose=verbose)
    if verbose not in ['none', 'min']:
        print('Optimal number of clusters:', n_clusters)

    clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
    if verbose in ['max', 'debug']: plot2d(1, 2, clusters, 'cluster_words', 10)

    # Generalisation - just histogram? - Grammar-Learner-Clustering-Words 2.6
    import numpy as np
    from src.clustering.similarity import cluster_similarity
    sim_df, res3 = cluster_similarity(clusters, 'max')
    log.update(res3)
    if verbose in ['max', 'debug']:
        count, division = np.histogram(sim_df['similarity'])
        sim_df['similarity'].hist(bins=division)
        print('Cluster similarities: absolute values > 0.1:')
        display(sim_df.sort_values(by='similarity', ascending=False)
                .loc[sim_df['similarity'] > 0.1])  # was a bare expression: no output inside a function

    # Save categories
    category_list = clusters2list(clusters)
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose in ['max', 'debug']:
        for line in categories.splitlines():
            print(line)
        print('<...>\nTotal', len(categories.splitlines()), \
              'lines, saved to', cat_file)
    #-print(len(categories.splitlines()), 'categories saved to', cat_file)

    # Grammar Learner
    # NOTE: grammar_learner is not imported above; it must be provided by the caller's scope
    lg_rule_list = grammar_learner(clusters, links, verbose)
    if verbose == 'max':
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict
    #TODO: return paths to categories and dict?
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
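
An end-to-end usage sketch over a directory of MST parses; all paths are hypothetical, and grammar_learner must be available in scope (see the note above).

response = learn_connectors(
    'data/POC-English/mst_parses/',
    'output/categories/', 'output/dict/',
    word_space='hyperwords', clustering='kmeans',
    cluster_range=(2, 48, 1), tmpath='/tmp/grammar_learner/')
print(response)  # {'categories_file': ..., 'grammar_file': ...}
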
Code example #6
def category_learner(links, \
    cat_path, dict_path, verbose='none', \
    parse_mode='given', \
    word_space = 'vectors', dim_max = 100, sv_min = 0.1, \
    dim_reduction = 'svm', \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = '',
    generalization = 'off',
    grammar_rules = 'connectors'):  # no actual need for grammar rules here?

    from IPython.display import display
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Grammar Learner v.0.2 2018-04-06', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''

    if word_space == 'vectors':
        if tmpath == '': tmpath = dict_path
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min,
                               verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)
    else:
        #TODO: word_space = 'discrete'... (for now this falls back to the vector path)
        if tmpath == '': tmpath = dict_path
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min,
                               verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)

    # Clustering
    #-clustering = 'group'

    if clustering == 'kmeans':
        #^from src.clustering.poc import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering,
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('Optimal number of clusters:', n_clusters)
        if verbose == 'max': plot2d(1, 2, clusters, 'cluster_words', 10)

    elif clustering[:5] in ['group', 'ident']:  #80606 test ~OK

        def group_links(links):
            #+TODO: old code ⇒ here  ⇒ src.clustering.group_links.py
            #-Old way:
            from src.link_grammar.turtle import lexical_entries, entries2clusters
            djs = links.rename(columns={'link': 'disjunct'})
            #-clusters = entries2clusters(lexical_entries(djs))
            entries = lexical_entries(djs)
            clusters = entries2clusters(entries).rename(
                columns={'germs': 'cluster_words'})
            return clusters

        #+from ... import group links
        clusters = group_links(links)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose == 'max':
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week

    # Save categories

    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''

    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''

    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose != 'none':
        print('<...>\nTotal', len(categories.splitlines()), 'lines, saved to',
              cat_file)

    return category_list, log
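
A sketch of the alternative 'group' clustering path in this version, which merges identical lexical entries via the inline group_links instead of running k-means; links and paths as in the previous example.

category_list, log = category_learner(
    links, 'output/categories/', 'output/dict/',
    clustering='group', tmpath='/tmp/grammar_learner/',
    verbose='min')
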
Code example #7
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar    - path/file.ext / dir ⇒ auto file name
    def kwa(v, k):
        return kwargs.get(k, v)

    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range   = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level   = kwa(0.9,      'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge      = kwa(0.8,      'categories_merge')
    #-cats_aggr       = kwa(0.2,      'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge     = kwa(0.8,      'rules_merge'),   # merge rules with similarity > this 'merge' criteria
    #-rules_aggr      = kwa(0.3,      'rules_aggregation'),   # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    if verbose not in ['none', 'min']:
        print('learn_grammar: grammar_rules:', grammar_rules)

    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os, pickle  #, collections
    from collections import OrderedDict
    import pandas as pd
    from shutil import copy2 as copy
    from IPython.display import display
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc05 import files2links  #80528 .poc05
    from src.clustering.poc05 import clusters2dict
    # module path assumed from the note at "Learn Grammar" below:
    from src.grammar_learner.poc05 import category_learner, induce_grammar
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table, plot2d
    from src.grammar_learner.generalization import generalize_categories, \
        reorder, cats2list, generalize_rules  #, aggregate, aggregate_word_categories

    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})

    #TODO: save kwargs?

    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})
    #-Save a copy of input parses to prj_dir + '/parses/'  #FIXME:DEL?    #80704
    #-parse_dir = prj_dir + '/parses/'
    #-if check_dir(parse_dir, True, verbose):
    #-    for file in files: copy(file, os.path.dirname(parse_dir))
    #-else: raise FileNotFoundError('File not found', input_parses)

    # group = True    #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files

    # files ⇒ links:
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats: computed inside files2links  #80605
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if verbose in ['max', 'debug']:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    # Learn categories: new 80619
    categories, re03 = category_learner(links, **kwargs)  #v.0.5 categories: {}
    log.update(re03)

    # Generalize categories   #TODO? "gen_cats" ⇒ "categories"? no new name
    if cats_gen == 'jaccard' or (cats_gen == 'auto' and clustering == 'group'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto'
                                  and clustering == 'kmeans'):
        #TODO: vectors g12n
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print(
                '#TODO: categories generalization based on cosine similarity')
    else:
        gen_cats = categories
        log.update({'generalization': 'off: cats_gen = ' + str(cats_gen)})
        if verbose in ['max', 'debug']:
            print(UTC(),':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')

    # Save 1st cats_file = to control 2-step generalization #FIXME:DEL?   #80704
    #-re05 = save_cat_tree(gen_cats, output_categories, verbose)
    #-log.update({'category_tree_file': re05['cat_tree_file']})
    # Save cats.pkl
    #-with open(re05['cat_tree_file'][:-3]+'pkl', 'wb') as f: #FIXME:DEL? #80704
    #-    pickle.dump(gen_cats, f)
    #-if verbose in ['max','debug']:
    #-    print(UTC(),':: learn_grammar: 1st cat_tree saved')

    # Learn grammar     #80623

    if grammar_rules != context:
        # re-extract links with context switched to grammar_rules, then restore;
        # kwargs.get-free form avoids KeyError when the keys were not passed
        kwargs['context'] = grammar_rules
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, re06 = files2links(**kwargs)
        kwargs['context'] = context

    # add disjuncts to categories {} after k-means clustering  #TODO: speed!
    def add_disjuncts(cats, links, verbose='none'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']),
                  'clusters')
        from copy import deepcopy
        top_clusters = [i for i,x in enumerate(cats['cluster']) \
                        if i > 0 and x is not None]
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: word_clusters:',
                  len(word_clusters), 'words')
        df = links.copy()
        if verbose in ['max', 'debug']:
            print(UTC(),
                  ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        df['links'] = [[x] for x in df['link']]
        if verbose in ['max', 'debug']:
            print(
                UTC(),
                ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
            )
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if verbose in ['max', 'debug']:
            print(UTC(),
                  ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        cdf = df.groupby('cluster').agg({
            'links': 'sum',
            'count': 'sum'
        }).reset_index()
        #TODO? del df[...] to free RAM?
        disjuncts = [[]] + cdf['links'].tolist()
        counts = [0] + cdf['count'].tolist()
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: len(cluster, disjuncts):',
                  len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
        fat_cats = deepcopy(cats)
        fat_cats['counts'] = [0] + cdf['count'].tolist()
        fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()
        #['djs']
        djset = set()
        [[djset.add(y) for y in x] for x in fat_cats['disjuncts']]
        djlist = sorted(djset)
        fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                           for y in fat_cats['disjuncts']]
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats

    #TODO: def djs? vectors(disjuncts, **kwargs)

    #if context < 2 and grammar_rules > 1:
    if word_space == 'vectors' or clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
            #with open(re05['cat_tree_file'][:-9]+'s.pkl', 'wb') as f: #FIXME:DEL tmp 80601
            #    pickle.dump(gen_cats, f)

        fat_cats = add_disjuncts(gen_cats, links)
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
        #TODO: fat_cats['djs'] = djs(fat_cats[disjuncts], **kwargs)   #TODO:
    else:
        fat_cats = gen_cats

    # Learn Grammar
    #^from src.grammar_learner.poc05 import induce_grammar
    rules, re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        print('induce_grammar ⇒ rules:')
        display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
            + [x for i,x in enumerate(cats2list(rules))]))

    # Generalize grammar rules
    gen_rules = rules
    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            from src.grammar_learner.generalization import generalize_rules
            gen_rules, re08 = generalize_rules(rules, **kwargs)
            log.update(re08)
            if verbose == 'debug':
                print('generalize_rules ⇒ gen_rules:')
                display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
                    + [x for i,x in enumerate(cats2list(gen_rules))]))

    # Save cat_tree.txt file
    #^from src.utl.write_files import save_cat_tree
    re09 = save_cat_tree(gen_rules, output_categories,
                         verbose='none')  #FIXME: verbose?
    #TODO: check file save error?
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})

    #TODO: elapsed execution time?  Save log?

    return log
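
A typical invocation sketch; the keys mirror the kwa() defaults above, the directories are hypothetical, and the returned log is an OrderedDict of per-step responses.

kwargs = {
    'left_wall': 'LEFT-WALL', 'period': True,
    'context': 1, 'grammar_rules': 2,   # words ⇒ categories, disjuncts ⇒ rules
    'word_space': 'vectors', 'clustering': 'kmeans',
    'categories_generalization': 'off',
    'rules_generalization': 'off',
    'verbose': 'min',
}
log = learn_grammar('data/POC-English/mst_parses/',
                    'output/categories/', 'output/dict/', **kwargs)
for step, result in log.items():
    print(step, ':', result)
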