Example #1
def print_kwargs(**kwargs):
    from src.utl.utl import UTC
    print('poc04 learn_grammar kwargs:')
    for k, v in kwargs.items():
        print(('- ' + k + ':                ')[:20], v)
    kwargs['printed'] = str(UTC())
    return kwargs
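
A minimal usage sketch (the call is illustrative, not from the source; it assumes the project's src.utl.utl module is importable, as in the snippet above):

settings = {'context': 2, 'grammar_rules': 2, 'verbose': 'min'}  # illustrative values
logged = print_kwargs(**settings)   # echoes each key/value pair
print(logged['printed'])            # UTC timestamp string added on return
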
 def test_turtle_generalize_both(self):
     base  = module_path + '/tests/data/POC-Turtle/' + \
         'generalized_categories_and_rules/dict_6C_2018-07-06_0005.4.0.dict'
     #'generalized_categories_and_rules/poc-turtle_6C_2018-06-08_0004.4.0.dict'
     input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
     batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
         UTC())[:10] + '/'
     prj_dir = batch_dir + 'generalized_categories_and_rules/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     kwargs = {
         'left_wall': 'LEFT-WALL',
         'period': True,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'jaccard',
         'rules_generalization': 'jaccard',
         'tmpath': module_path + '/tmp/',
         'verbose': 'none'
     }
     response = learn_grammar(input_parses, outpath, outpath, **kwargs)
     with open(response['grammar_file'], 'r') as f:
         rules = f.read().splitlines()
     rule_list = [line for line in rules if line[0:1] in ['"', '(']]
     with open(base, 'r') as f:
         lst = f.read().splitlines()
     base_list = [line for line in lst if line[0:1] in ['"', '(']]
     if len(rule_list) == len(base_list):
         assert rule_list == base_list
     else:
         assert len(rule_list) == len(base_list)
Example #3
def save_cat_tree(cats, output_categories, verbose='none'):  #80706 0.5
    #80611 ~ cats2list without 'djs', 'children'...
    # cats: {'cluster':[], 'words':[], ...}                     #80609
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC

    tree_file = output_categories
    if '.' not in tree_file:  #auto file name
        if tree_file[-1] != '/': tree_file += '/'
        #-tree_file += (str(len(set([x[0] for x in cats_list]))) + '_cat_tree.txt')
        n_cats = len(
            [x for i, x in enumerate(cats['parent']) if i > 0 and x < 1])
        tree_file += (str(n_cats) + '_cat_tree.txt')

    categories = []
    for i, cluster in enumerate(cats['cluster']):
        if i == 0: continue
        category = []
        if cats['cluster'][i] is not None:
            category.append(cats['cluster'][i])
        else:
            category.append('')
        category.append(cats['parent'][i])
        category.append(i)
        category.append(round(cats['quality'][i], 2))
        #!category.append(sorted(cats['words'][i]))  #80704+06 tmp hack FIXME
        wordz = deepcopy(sorted(cats['words'][i]))
        #-80704 word@1, word@2 ⇒ word.a, word.b:
        #-wordz = [x.replace('@1','.a') for x in wordz]
        #-wordz = [x.replace('@2','.b') for x in wordz]
        #-wordz = [x.replace('@3','.c') for x in wordz]
        wordz = [x.replace('@', '.')
                 for x in wordz]  #80706 WSD: word@1 ⇒ word.1
        category.append(wordz)  #80704+06 tmp hack FIXME
        #80704+06 end
        category.append(cats['similarities'][i])
        #-category.append(cats['children'][i])
        categories.append(category)

    string = list2file(categories, tree_file)

    if verbose in ['max', 'debug']:
        print(UTC(), ':: src/utl/write_files.py save_cat_tree:', \
            len(cats['cluster']) - 1, 'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        display(
            html_table(
                [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] +
                categories))

    return {'cat_tree_file': tree_file}
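
The cats argument above is a dict of parallel lists indexed by cluster id, with row 0 reserved as a dummy root that the loop skips. A hand-made sketch of the expected shape (field names follow the code above; values are illustrative):

cats = {
    'cluster': [None, 'C01', 'C02'],
    'parent':  [0, 0, 0],
    'words':   [[], ['bird', 'fish'], ['swims', 'flies']],
    'quality': [0, 1.0, 1.0],
    'similarities': [[0], [1, 1], [1, 1]],
}
# save_cat_tree(cats, '/tmp/output/')  # ⇒ {'cat_tree_file': '/tmp/output/2_cat_tree.txt'}
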
Example #4
 def add_disjuncts(cats, links, verbose='none'):
     from copy import deepcopy
     from src.utl.utl import UTC  # needed by the debug prints below
     if verbose in ['max', 'debug']:
         print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']),
               'clusters')
     top_clusters = [i for i,x in enumerate(cats['cluster']) \
                     if i > 0 and x is not None]
     word_clusters = dict()
     for i in top_clusters:
         for word in cats['words'][i]:
             word_clusters[word] = i
     if verbose in ['max', 'debug']:
         print(UTC(), ':: add_disjuncts: word_clusters:',
               len(word_clusters), 'words')
     df = links.copy()
     if verbose in ['max', 'debug']:
         print(UTC(),
               ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
     df['links'] = [[x] for x in df['link']]
     if verbose in ['max', 'debug']:
         print(
             UTC(),
             ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
         )
     df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
     if verbose in ['max', 'debug']:
         print(UTC(),
               ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
     cdf = df.groupby('cluster').agg({
         'links': 'sum',
         'count': 'sum'
     }).reset_index()
     #TODO? del df[...] to free RAM?
     disjuncts = [[]] + cdf['links'].tolist()
     counts = [0] + cdf['count'].tolist()
     if verbose in ['max', 'debug']:
         print(UTC(), ':: add_disjuncts: len(cluster, disjuncts):', \
               len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
     fat_cats = deepcopy(cats)
     fat_cats['counts'] = [0] + cdf['count'].tolist()
     fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()
     #['djs']
     djset = set()
     for x in fat_cats['disjuncts']:
         djset.update(x)  # collect every distinct disjunct
     djlist = sorted(djset)
     fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                        for y in fat_cats['disjuncts']]
     if verbose in ['max', 'debug']:
         print(UTC(), ':: add_disjuncts: return fat_cats')
     return fat_cats
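
A small demonstration of the pandas idiom used above: with a list-valued 'links' column, agg({'links': 'sum'}) concatenates the per-word link lists within each cluster, because object-dtype 'sum' falls back to list concatenation via +. Column names mirror the code above; the data is made up:

import pandas as pd

df = pd.DataFrame({'word':  ['bird', 'fish', 'swims'],
                   'link':  ['flies-', 'swims-', 'fish+'],
                   'count': [2, 3, 1]})
df['links'] = [[x] for x in df['link']]
df['cluster'] = [1, 1, 2]  # as if word_clusters assigned bird & fish to cluster 1
cdf = df.groupby('cluster').agg({'links': 'sum', 'count': 'sum'}).reset_index()
print(cdf['links'].tolist())  # [['flies-', 'swims-'], ['fish+']]
print(cdf['count'].tolist())  # [5, 1]
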
 def test_turtle_diled(self):
     corpus = 'POC-Turtle'
     dataset = 'MST_fixed_manually'
     input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
     base  = module_path + '/tests/data/POC-Turtle/' + \
         '2018-04-25/turtle_dILEd_LW+dot+_2018-04-25_0008.4.0.dict'
     batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
         UTC())[:10] + '/'
     prj_dir = batch_dir + 'Turtle_dILEd_LW_and_period/'
     if check_dir(prj_dir, create=True, verbose='max'):
         output_categories = prj_dir
         output_grammar = prj_dir
     kwargs = {
         'left_wall': 'LEFT-WALL',
         'period': True,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'verbose': 'min'
     }
     response = learn_grammar(input_parses, output_categories,
                              output_grammar, **kwargs)
     with open(response['grammar_file'], 'r') as f:
         rules = f.read().splitlines()
     rule_list = [line for line in rules if line[0:1] in ['"', '(']]
     with open(base, 'r') as f:
         lst = f.read().splitlines()
     base_list = [line for line in lst if line[0:1] in ['"', '(']]
     if len(rule_list) == len(base_list):
         if kwargs['verbose'] == 'debug':
             print('\nTest results vs baseline:')
             for i, rule in enumerate(base_list):
                 print(rule_list[i])
                 print(rule)
         assert rule_list == base_list
     else:
         assert len(rule_list) == len(base_list)
Example #6
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar    - path/file.ext / dir ⇒ auto file name
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v

    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range   = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level   = kwa(0.9,      'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge      = kwa(0.8,      'categories_merge')
    #-cats_aggr       = kwa(0.2,      'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge     = kwa(0.8,      'rules_merge'),   # merge rules with similarity > this 'merge' criteria
    #-rules_aggr      = kwa(0.3,      'rules_aggregation'),   # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    #80509 kwargs tests ~OK
    #-print('poc04 learn_grammar kwargs:')
    #-for k,v in kwargs.items(): print(('- '+k+':                ')[:20], v)
    #-response = print_kwargs(**kwargs)
    #-return response
    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os  #, collections
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    #+from src.link_grammar.poc04 import category_learner
    from src.clustering.poc04 import clusters2dict
    #+from src.link_grammar.poc04 import grammar_learner
    #-from src.link_grammar.poc import save_link_grammar
    from src.utl.write_files import list2file, save_link_grammar
    from src.utl.widgets import html_table, plot2d
    from IPython.display import display  # display() is called below at higher verbosity

    from collections import OrderedDict
    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    #log.update({'datime': str(UTC()), 'learn_grammar': '80510'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    #for file in files: copy(file, output_categories)
    #TODO: output_categories file ⇒ dir
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if check_dir(parse_dir, True, verbose):
        for file in files:
            copy(file, os.path.dirname(parse_dir))
    else:
        raise FileNotFoundError('File not found', input_parses)
    # group = True    #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)
    # Save 1st cats_file - to control 2-step generalization  #FIXME:DEL
    cats_file = output_categories
    if '.' not in cats_file:  #80508 auto file name
        if cats_file[-1] != '/': cats_file += '/'
        cats_file += (str(len(set([x[0] for x in category_list]))) +
                      '_categories.txt')
    #TODO: comment saving cats_file and run tests 80523
    #+categories = list2file(category_list, cats_file)
    log.update({'categories_file': cats_file})
    #...TODO... hierarchical categories  80523 snooze
    #...display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
    #...        + category_list))

    if grammar_rules != context:
        #-links, res4 = files2links(files, parse_mode, grammar_rules, group, left_wall, period, verbose)
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        links, re04 = files2links(**kwargs)
        kwargs['context'] = context

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)
    #...display(html_table([['Rules','','','','','']] + rule_list))

    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            cats_list, re06 = generalise_rules(rule_list, **kwargs)
            #TODO: = generalise_rules(rule_list, **kwargs)
            log.update(re06)
            if len(set([x[0] for x in cats_list])) < len(
                    set([x[0] for x in category_list])):
                category_list = cats_list
                # Save 2nd cats_file - overwrite in case of equal
                cats_file = output_categories
                if '.' not in cats_file:  #80508 auto file name
                    if cats_file[-1] != '/': cats_file += '/'
                    cats_file += (str(len(set([x[0]
                                               for x in category_list]))) +
                                  '_categories.txt')
                #TODO: comment saving cats_file and run tests 80523
                #+categories = list2file(category_list, cats_file)
                log.update({'categories_file': cats_file})
                word_clusters = clusters2dict(category_list)
                rule_list, re07 = grammar_learner(word_clusters, links,
                                                  **kwargs)
                #...display(html_table([['Rules','','','','','']] + rule_list))
                log.update(re07)
                if verbose == 'debug':
                    print('\nrules_generalisation ⇒ category_list:',
                          category_list)
    if verbose not in ['min', 'none']:
        display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
            + category_list))

    # Save cat_tree.txt file
    from src.utl.write_files import save_category_tree
    tree_file = cats_file[:cats_file.rindex('_')] + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file,
                              verbose)  #FIXME: verbose?
    log.update(re08)
    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)

    return log
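
A hedged sketch of a typical call, mirroring the test methods above (module_path and prj_dir are assumed to exist; the kwargs subset is the one the tests pass):

input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
kwargs = {'left_wall': 'LEFT-WALL', 'period': True, 'context': 2,
          'word_space': 'discrete', 'dim_reduction': 'none',
          'clustering': 'group', 'grammar_rules': 2,
          'categories_generalization': 'off', 'rules_generalization': 'off',
          'tmpath': module_path + '/tmp/', 'verbose': 'min'}
log = learn_grammar(input_parses, prj_dir, prj_dir, **kwargs)
print(log['grammar_file'])  # path of the saved Link Grammar .dict
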
 def setUp(self):  #FIXME: should run before every test, but doesn't?!
     #-import os, sys
     # Paths #FIXME: don't run?
     #module_path = os.path.abspath(os.path.join('..'))
     #if module_path not in sys.path:
     #    sys.path.append(module_path)
     # Imports - moved up:  #FIXME: don't run here?
     #-from src.grammar_learner.poc05 import learn_grammar
     #-from src.utl.utl import UTC
     src_path = module_path + '/src'
     if os.path.exists(src_path) and src_path not in sys.path:
         sys.path.append(src_path)
     # Don't need link grammar paths with new (June 2018) Grammar Tester (?)
     #-lg_path = '/home/oleg/miniconda3/envs/ull4/lib/python3.6/site-packages/linkgrammar'
     #-if os.path.exists(lg_path) and lg_path not in sys.path:
     #-    sys.path.append(lg_path)
     #-link_grammar_path = module_path + '/src/link_grammar'
     #-if os.path.exists(link_grammar_path) and link_grammar_path not in sys.path:
     #-    sys.path.append(link_grammar_path)
     input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
     batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
         UTC())[:10] + '/'
     # Grammar Learner 0.5 parameters:
     # input_parses, output_categories, output_grammar, **kwargs
     kwargs = {  # defaults  #FIXME: don't pass to tests (should?)
         'parse_mode': 'given',      # 'given' (default) / 'explosive' (next)
         'left_wall': 'LEFT-WALL',   # '','none' - don't use / 'LEFT-WALL' - replace ###LEFT-WALL###
         'period': True,             # use period in links learning: True/False
         'context': 2,               # 1: connectors / 2,3...: disjuncts
         'window': 'mst',            # 'mst' / reserved options for «explosive» parsing
         'weighting': 'ppmi',        # 'ppmi' / future options
         'group': True,              # group items after link parsing
         'distance': False,          # reserved options for «explosive» parsing
         'word_space': 'discrete',   # 'vectors' / 'discrete' - no dimensionality reduction
         'dim_max': 100,             # max vector space dimensionality
         'sv_min': 0.1,              # minimal singular value (fraction of the max value)
         'dim_reduction': 'none',    # 'svm' / 'none' (discrete word_space, group)
         'clustering': 'group',      # 'kmeans' / 'group'~'identical_entries' / future options
         'cluster_range': (2, 48, 1),         # min, max, step
         'cluster_criteria': 'silhouette',    # optimal clustering criteria
         'cluster_level': 0.9,       # level = 0, 1, 0.-0.99..: 0 - max number of clusters
         'categories_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
         'categories_merge': 0.8,    # merge categories with similarity > this 'merge' criteria
         'categories_aggregation': 0.2,  # aggregate categories with similarity > this criteria
         'grammar_rules': 2,         # 1: 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
         'rules_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
         'rules_merge': 0.8,         # merge rules with similarity > this 'merge' criteria
         'rules_aggregation': 0.2,   # aggregate rules with similarity > this criteria
         'tmpath': module_path + '/tmp/',
         'verbose': 'min',           # display intermediate results: 'none', 'min', 'mid', 'max'
         # Additional (optional) parameters for parse_metrics (_ability & _quality):
         'test_corpus': module_path + '/data/POC-Turtle/poc-turtle-corpus.txt',
         'reference_path': module_path + '/data/POC-Turtle/poc-turtle-parses-expected.txt',
         'template_path': 'poc-turtle',  #FIXME: changed in June 2018 Grammar Tester
         'linkage_limit': 1
     }
     pass
def category_learner(links, \
    cat_path, dict_path, tmpath = '', verbose = 'none', \
    parse_mode = 'given', left_wall = '', period = False, \
    context = 1, window = 'mst', weighting = 'ppmi', group = True, \
    word_space = 'vectors', dim_max = 100, sv_min = 0.1,
    dim_reduction = 'svm', \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9): #, \
    #-generalization = 'off', merge = 0.8, aggregate = 0.2, grammar_rules = 1):

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc03 import number_of_clusters, clusters2list  #80422
    from src.utl.turtle import html_table, plot2d
    from IPython.display import display  # used to render the category table below
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Grammar Learner v.0.3 2018-04-11', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''

    if tmpath == '': tmpath = dict_path  # temporary files path
    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if word_space == 'vectors':
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        vdf, sv, res3 = pmisvd(links, tmpath, tmpath, dim)
        log.update(res3)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if clustering == 'kmeans':
        #^from src.clustering.kmeans import cluster_words_kmeans
        #^from src.clustering.poc03 import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc03/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']: print(clusters.applymap(round2))
        if verbose in ['max', 'debug']:  #80412 hack: plots for AGI-2018 :(
            if context == 1:  #FIXME:DEL?
                plot2d(1, 2, clusters, 'cluster_words', 10)
            else:
                if len(clusters) < 6:
                    plot2d(1, 3, clusters, 'cluster_words', 10)
                else:
                    plot2d(1, 4, clusters, 'cluster_words', 10)

    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']: print('clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week

    # Save categories

    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''

    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + str(len(clusters)) + '_categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''

    if verbose in ['debug']:
        print('\nWord categories:\n')
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose not in ['min', 'none']:
        print('\nCategory list -', len(categories.splitlines()),
              'lines, saved to', cat_file)

    return category_list, log
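
category_learner consumes links as a DataFrame with ['word', 'link', 'count'] columns (documented in Example #13 below). A minimal hand-made frame, with the call hedged out since it needs the project's clustering modules:

import pandas as pd

links = pd.DataFrame({'word':  ['bird', 'bird', 'swims'],
                      'link':  ['flies+', 'swims+', 'fish-'],
                      'count': [2, 1, 3]})
# category_list, log = category_learner(links, cat_path, dict_path,
#     word_space='vectors', clustering='kmeans', verbose='min')
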
Example #9
def save_link_grammar(rules,
                      output_grammar,
                      grammar_rules=2,
                      header='',
                      footer=''):  #80626
    # rules: [] - list of rules, or {} - rules dict (converted via rules2list below)
    # grammar_rules = kwargs['grammar_rules']: 1 ⇒ connectors, 2+ ⇒ disjuncts
    import os
    from ..utl.utl import UTC
    #-if path[-1] != '/': path += '/'

    if type(rules) is dict:  #80620 0.5 new data structure, 80626 connector-based rules
        rules = rules2list(rules, grammar_rules)

    link_grammar = ''  #80510 0.4
    line_list = list()
    clusters = set()
    for rule in rules:
        line = ''
        if len(rule[2]) > 0 and len(rule[3]) > 0:
            line += '{' + ' or '.join(str(x) for x in rule[2]) \
                + '} & {' +  ' or '.join(str(y) for y in rule[3]) + '}'
        else:
            if len(rule[2]) > 0:
                line += ' or '.join('(' + str(x) + ')' for x in rule[2])
            elif len(rule[3]) > 0:
                line += ' or '.join('(' + str(x) + ')' for x in rule[3])
        if len(rule[4]) > 0:
            if line != '': line += ' or '
            line += ' or '.join('(' + str(x) + ')' for x in rule[4])

        cluster_number = '% ' + str(rule[0]) + '\n'  # comment line: cluster
        cluster_and_words = ' '.join('"' + word + '"'
                                     for word in rule[1]) + ':\n'
        line_list.append(cluster_number + cluster_and_words + line + ';\n')
        clusters.add(rule[0])

    line_list.sort()  #FIXME: overkill?
    #TODO: file naming - corpus name?
    #-if file != '': out_file = path + file
    if os.path.isfile(output_grammar):
        out_file = output_grammar
    elif os.path.isdir(output_grammar):
        out_file = output_grammar
        if out_file[-1] != '/': out_file += '/'
        #-if 'isa' in '\t'.join(line_list): out_file += 'poc-turtle_'
        #-else: out_file += 'poc-english_'
        #out_file += 'poc-english_'   #80704 replaced with:
        out_file += 'dict_'
        out_file = out_file + str(len(clusters)) + 'C_' \
            + str(UTC())[:10] + '_0005.4.0.dict'            #80620 0004⇒0005
    else:
        raise FileNotFoundError('File not found', output_grammar)
    if header == '':
        header = '% Grammar Learner v.0.5 ' + str(UTC())  #80620 .4⇒.5
    header = header + '\n' + '<dictionary-version-number>: V0v0v5+;\n' \
        + '<dictionary-locale>: EN4us+;'
    add_rules = 'UNKNOWN-WORD: XXX+;'
    if footer == '':
        footer = '% '+ str(len(clusters)) + ' word clusters, ' \
            + str(len(rules)) + ' Link Grammar rules.\n' \
            + '% Link Grammar file saved to: ' + out_file
    lg = header + '\n\n' + '\n'.join(
        line_list) + '\n' + add_rules + '\n\n' + footer
    #-80704 tmp FIXME:
    #-lg = lg.replace('@1', '.a')
    #-lg = lg.replace('@2', '.b')
    #-lg = lg.replace('@3', '.c')
    lg = lg.replace('@', '.')  #80706 WSD: word@1 ⇒ word.1
    with open(out_file, 'w') as f:
        f.write(lg)

    from collections import OrderedDict
    response = OrderedDict({'grammar_file': out_file})
    response.update({
        'grammar_clusters': len(clusters),
        'grammar_rules': len(rules)
    })
    return response
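
Each rule handed to save_link_grammar is indexed positionally: rule[0] cluster id, rule[1] germ words, rule[2] left connectors, rule[3] right connectors, rule[4] disjuncts. A tiny illustrative call (cluster and connector names are invented; the output directory is assumed to exist):

rules = [
    ['C01', ['bird', 'fish'], ['C02C01-'], ['C01C02+'], []],
    ['C02', ['swims'], [], [], ['C01C02- & C02C01+']],
]
response = save_link_grammar(rules, '/tmp/output/')  # dir ⇒ auto file name
print(response['grammar_clusters'], response['grammar_rules'])  # 2 2
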
def learn_connectors(input_dir, cat_path, dict_path, verbose='none', \
    parse_mode='given', \
    word_space = 'hyperwords', dim_max = 100, sv_min = 0.1, \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = ''):

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.utl.read_files import check_mst_files
    from src.space.poc import files2links
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from IPython.display import display  # used when verbose == 'max'
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Unified Grammar Learner: Clustering words', \
           'date': str(UTC()), 'project_dir': dict_path, 'input_dir': input_dir }
    """TODO: dict ⇒ list [[]] / OrderedDict?"""

    files, response = check_mst_files(input_dir, verbose='none')
    links = files2links(files, parse_mode='given', context=1, group = True, \
                        left_wall='LEFT-WALL', period=True, verbose='none')

    # vector_space_dim(links, path, tmpath, dim_max=100, sv_min=0.9, 'max')
    if tmpath == '': tmpath = dict_path
    dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
    log.update({'vector_space_dim': dim})
    if verbose not in ['none', 'min']:
        print('Optimal vector space dimensionality:', dim)

    vdf, sv, res2 = pmisvd(links, dict_path, tmpath, dim)
    log.update(res2)
    #-vdf.applymap(round2).sort_values(by=[1,2,3], ascending=[False,False,False])

    n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
        criteria=cluster_criteria, level=cluster_level, verbose=verbose)
    if verbose not in ['none', 'min']:
        print('Optimal number of clusters:', n_clusters)

    clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
    if verbose in ['max', 'debug']: plot2d(1, 2, clusters, 'cluster_words', 10)

    # Generalisation - just histogram? - Grammar-Learner-Clustering-Words 2.6
    import numpy as np
    from src.clustering.similarity import cluster_similarity
    sim_df, res3 = cluster_similarity(clusters, 'max')
    log.update(res3)
    if verbose in ['max', 'debug']:
        count, division = np.histogram(sim_df['similarity'])
        sim_df['similarity'].hist(bins=division)
        print('Cluster similarities: absolute values > 0.1:')
        sim_df.sort_values(by='similarity',
                           ascending=False).loc[(sim_df['similarity']) > 0.1]

    # Save categories
    category_list = clusters2list(clusters)
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose in ['max', 'debug']:
        for line in categories.splitlines():
            print(line)
        print('<...>\nTotal', len(categories.splitlines()), \
              'lines, saved to', cat_file)
    #-print(len(categories.splitlines()), 'categories saved to', cat_file)

    # Grammar Learner
    lg_rule_list = grammar_learner(clusters, links, verbose)
    if verbose == 'max':
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict
    #TODO: return paths to categories and dict?
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
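
The grammar file path above is recovered by slicing the footer line that save_link_grammar appends ('% Link Grammar file saved to: <path>'); a self-contained sketch of that slicing:

s = '% Link Grammar file saved to: /tmp/output/dict_2C_2018-07-06_0005.4.0.dict'
lg_file = s[s.find(': ') + 2:]
print(lg_file)  # /tmp/output/dict_2C_2018-07-06_0005.4.0.dict
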
Example #11
def category_learner(links, \
    cat_path, dict_path, verbose='none', \
    parse_mode='given', \
    word_space = 'vectors', dim_max = 100, sv_min = 0.1, \
    dim_reduction = 'svm', \
    clustering = 'kmeans', cluster_range = (2,48,1), \
    cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = '',
    generalization = 'off',
    grammar_rules = 'connectors'):  # no actual need for grammar rules here?

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from IPython.display import display  # used to render the category table below
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    log = {'project': 'Grammar Learner v.0.2 2018-04-06', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''

    if word_space == 'vectors':
        if tmpath == '': tmpath = dict_path
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min,
                               verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)
    else:
        #TODO: word_space = 'discrete'...
        if tmpath == '': tmpath = dict_path
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min,
                               verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)

    # Clustering
    #-clustering = 'group'

    if clustering == 'kmeans':
        #^from src.clustering.poc import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('Optimal number of clusters:', n_clusters)
        if verbose == 'max': plot2d(1, 2, clusters, 'cluster_words', 10)

    elif clustering[:5] in ['group', 'ident']:  #80606 test ~OK

        def group_links(links):
            #+TODO: old code ⇒ here  ⇒ src.clustering.group_links.py
            #-Old way:
            from src.link_grammar.turtle import lexical_entries, entries2clusters
            djs = links.rename(columns={'link': 'disjunct'})
            #-clusters = entries2clusters(lexical_entries(djs))
            entries = lexical_entries(djs)
            clusters = entries2clusters(entries).rename(
                columns={'germs': 'cluster_words'})
            return clusters

        #+from ... import group links
        clusters = group_links(links)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose == 'max':
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week

    # Save categories

    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''

    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''

    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose != 'none':
        print('<...>\nTotal', len(categories.splitlines()), 'lines, saved to',
              cat_file)

    return category_list, log
Example #12
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar    - path/file.ext / dir ⇒ auto file name
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v

    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range   = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level   = kwa(0.9,      'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge      = kwa(0.8,      'categories_merge')
    #-cats_aggr       = kwa(0.2,      'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge     = kwa(0.8,      'rules_merge'),   # merge rules with similarity > this 'merge' criteria
    #-rules_aggr      = kwa(0.3,      'rules_aggregation'),   # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    print('learn_grammar: grammar_rules:', grammar_rules)

    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os, pickle  #, collections
    from collections import OrderedDict
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc05 import files2links  #80528 .poc05
    from src.clustering.poc05 import clusters2dict
    #+from src.link_grammar.poc05 import category_learner
    #+from src.link_grammar.poc05 import induce_grammar
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table, plot2d
    from IPython.display import display  # display() is called in the debug branches below
    from src.grammar_learner.generalization import generalize_categories, \
        reorder, cats2list, generalize_rules #, aggregate, aggregate_word_categories\

    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})

    #TODO: save kwargs?

    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})
    #-Save a copy of input parses to prj_dir + '/parses/'  #FIXME:DEL?    #80704
    #-parse_dir = prj_dir + '/parses/'
    #-if check_dir(parse_dir, True, verbose):
    #-    for file in files: copy(file, os.path.dirname(parse_dir))
    #-else: raise FileNotFoundError('File not found', input_parses)

    # group = True    #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files

    # files ⇒ links:
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats - implanted in files2links 80605
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if verbose in ['max', 'debug']:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    # Learn categories: new 80619
    categories, re03 = category_learner(links, **kwargs)  #v.0.5 categories: {}
    log.update(re03)

    # Generalize categories   #TODO? "gen_cats" ⇒ "categories"? no new name
    if cats_gen == 'jaccard' or (cats_gen == 'auto' and clustering == 'group'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto'
                                  and clustering == 'kmeans'):
        #TODO: vectors g12n
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print(
                '#TODO: categories generalization based on cosine similarity')
    else:
        gen_cats = categories
        log.update({'generalization': 'error: cats_gen = ' + str(cats_gen)})
        if verbose in ['max', 'debug']:
            print(UTC(),':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')

    # Save 1st cats_file - to control 2-step generalization  #FIXME:DEL?   #80704
    #-re05 = save_cat_tree(gen_cats, output_categories, verbose)
    #-log.update({'category_tree_file': re05['cat_tree_file']})
    # Save cats.pkl
    #-with open(re05['cat_tree_file'][:-3]+'pkl', 'wb') as f: #FIXME:DEL? #80704
    #-    pickle.dump(gen_cats, f)
    #-if verbose in ['max','debug']:
    #-    print(UTC(),':: learn_grammar: 1st cat_tree saved')

    # Learn grammar     #80623

    if grammar_rules != context:
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, re06 = files2links(**kwargs)
        kwargs['context'] = context

    # add disjuncts to categories {} after k-means clustering  #TODO: speed!
    def add_disjuncts(cats, links, verbose='none'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']),
                  'clusters')
        from copy import deepcopy
        top_clusters = [i for i,x in enumerate(cats['cluster']) \
                        if i > 0 and x is not None]
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: word_clusters:',
                  len(word_clusters), 'words')
        df = links.copy()
        if verbose in ['max', 'debug']:
            print(UTC(),
                  ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        df['links'] = [[x] for x in df['link']]
        if verbose in ['max', 'debug']:
            print(
                UTC(),
                ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
            )
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if verbose in ['max', 'debug']:
            print(UTC(),
                  ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        cdf = df.groupby('cluster').agg({
            'links': 'sum',
            'count': 'sum'
        }).reset_index()
        #TODO? del df[...] to free RAM?
        disjuncts = [[]] + cdf['links'].tolist()
        counts = [0] + cdf['count'].tolist()
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: len(cluster, disjuncts):', \
                  len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
        fat_cats = deepcopy(cats)
        fat_cats['counts'] = [0] + cdf['count'].tolist()
        fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()
        #['djs']
        djset = set()
        for x in fat_cats['disjuncts']:
            djset.update(x)  # collect every distinct disjunct
        djlist = sorted(djset)
        fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                           for y in fat_cats['disjuncts']]
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats

    #TODO: def djs? vectors(disjuncts, **kwargs)

    #if context < 2 and grammar_rules > 1:
    if word_space == 'vectors' or clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
            #with open(re05['cat_tree_file'][:-9]+'s.pkl', 'wb') as f: #FIXME:DEL tmp 80601
            #    pickle.dump(gen_cats, f)

        fat_cats = add_disjuncts(gen_cats, links)
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
        #TODO: fat_cats['djs'] = djs(fat_cats[disjuncts], **kwargs)   #TODO:
    else:
        fat_cats = gen_cats

    # Learn Grammar
    #+from src.grammar_learner.poc05 import induce_grammar
    rules, re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        print('induce_grammar ⇒ rules:')
        display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
            + [x for i,x in enumerate(cats2list(rules))]))

    # Generalize grammar rules
    gen_rules = rules
    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            from src.grammar_learner.generalization import generalize_rules
            gen_rules, re08 = generalize_rules(rules, **kwargs)
            log.update(re08)
            if verbose == 'debug':
                print('generalize_rules ⇒ gen_rules:')
                display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
                    + [x for i,x in enumerate(cats2list(gen_rules))]))

    # Save cat_tree.txt file
    #^from src.utl.write_files import save_cat_tree
    re09 = save_cat_tree(gen_rules, output_categories,
                         verbose='none')  #FIXME: verbose?
    #TODO: check file save error?
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})

    #TODO: elapsed execution time?  Save log?

    return log
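
The returned log is an OrderedDict that accumulates each sub-step's report; a sketch of inspecting it after a run (key names taken from the log.update calls above):

log = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
for key in ['start', 'project_directory', 'corpus_stats_file',
            'cat_tree_file', 'grammar_file', 'finish']:
    if key in log:
        print(key, ':', log[key])
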
Example #13
def category_learner(links, **kwargs):  #80619 POC.0.5
    # links - DataFrame ['word', 'link', 'count']
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v

    #-links = kwargs['links']   # links - check?
    cats_file = kwa('/output', 'output_categories')  # to define tmpath
    #-dict_path       = kwa('/output', 'output_grammar')   # not used here
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    cluster_range = kwa((2, 48, 1), 'cluster_range')
    cluster_criteria = kwa('silhouette', 'cluster_criteria')
    cluster_level = kwa(0.9, 'cluster_level')
    generalization = kwa('off', 'categories_generalization')
    merge = kwa(0.8, 'categories_merge')
    aggregate = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    verbose = kwa('none', 'verbose')

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc05 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d
    from src.utl.read_files import check_dir  #, check_mst_files
    from src.utl.write_files import list2file, save_link_grammar
    #-from src.grammar_learner.poc05 import group_links, \
    #-    aggregate_cosine, aggregate_jaccard, aggregate_word_categories

    from collections import OrderedDict
    log = OrderedDict()
    log.update({'category_learner': '80619'})

    if tmpath == '' or tmpath == 'auto':  # temporary files path
        if '.' not in cats_file: tmpath = cats_file
        else: tmpath = cats_file[:cats_file.rindex('/')]
        if tmpath[-1] != '/': tmpath += '/'
        tmpath += 'tmp/'
        print('tmpath:', tmpath)
    if check_dir(tmpath, True, verbose):
        log.update({'tmpath': tmpath})
    #TODO:ERROR

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    #-if word_space == 'vectors':    #80619 Category-Tree-2018-06-19.ipynb
    if context == 1 or word_space[0] in ['v', 'e'] or clustering == 'kmeans':
        #word_space options: v,e: 'vectors'='embeddings', d,w: 'discrete'='word_vectors'
        print('DRK: context =',
              str(context) + ', word_space: ' + word_space + ', clustering:',
              clustering)
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose in ['mid', 'max', 'debug']:
            print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
        #-if clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ number_of_clusters')
        n_clusters = number_of_clusters(vdf, cluster_range, clustering,  \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        log.update({'n_clusters': n_clusters})
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ cluster_words_kmeans:',
                  n_clusters, 'clusters')
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        log.update({'silhouette': silhouette, 'inertia': inertia})
    #-elif clustering[:5] in ['group','ident']:
    else:
        if verbose in ['max', 'debug']:
            print(UTC(),':: category_learner ⇒ iLE group_links: context =', \
                str(context)+', word_space: '+str(word_space)+', clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        log.update({'n_clusters': len(clusters)})
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))

    # Convert clusters DataFrame ⇒ cats {}   #80619 0.5
    #TODO?: if clusters == pd.dataframe:
    if verbose in ['max', 'debug']:
        print(UTC(), ':: category_learner: convert clusters ⇒ cats {}')
    cats = {}  #80609 dict instead of DataFrame
    cats['cluster'] = ['C0'] + clusters['cluster'].tolist()
    cats['parent'] = [0 for x in cats['cluster']]
    cats['words'] = [[]] + [set(x) for x in clusters['cluster_words'].tolist()]
    if 'disjuncts' in clusters:
        cats['disjuncts'] = [[]] + clusters['disjuncts'].tolist()
        djset = set()
        for x in cats['disjuncts']:
            djset.update(x)  # collect every distinct disjunct
        djlist = sorted(djset)
        cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                       for y in cats['disjuncts']]
    if 'counts' in clusters:
        cats['counts'] = [0] + clusters['counts'].tolist()
    if word_space == 'vectors' or clustering == 'kmeans':
        cats['quality'] = [0 for x in cats['words']]
        cats['similarities'] = [[0 for y in x] for x in cats['words']]
    else:
        cats['quality'] = [1 for x in cats['words']]
        cats['quality'][0] = 0
        cats['similarities'] = [[1 for y in x] for x in cats['words']]
        cats['similarities'][0] = [0]
    cats['children'] = [0 for x in cats['words']]

    return cats, log
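
For a two-cluster 'group' run with disjunct counts available, the returned cats dict would look roughly like this (row 0 is the dummy root; all values are illustrative):

cats = {
    'cluster': ['C0', 'C01', 'C02'],
    'parent':  [0, 0, 0],
    'words':   [[], {'bird', 'fish'}, {'swims'}],
    'disjuncts': [[], ['flies- & swims+'], ['fish-']],
    'djs': [set(), {1}, {0}],    # indices into sorted(djset): 'fish-' < 'flies- & swims+'
    'counts': [0, 5, 2],
    'quality': [0, 1, 1],        # 1 everywhere but the root for the 'group' path
    'similarities': [[0], [1, 1], [1]],
    'children': [0, 0, 0],
}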