コード例 #1
0
def grammar_learner(clusters, links, **kwargs):
    """Derive a sorted list of grammar rules from word clusters and links.

    Args:
        clusters: word cluster data, passed unchanged to ``links2stalks``.
        links: link data, passed unchanged to ``links2stalks``.
        **kwargs: optional settings; only two are read here:
            ``grammar_rules`` (default 1) is forwarded to ``links2stalks``;
            ``verbose`` (default 'none'): 'debug' prints intermediate data,
            any value other than 'min'/'none' renders the rules as a table.

    Returns:
        Tuple ``(rule_list, log)``.  Each rule is the 5-item list
        ``[cluster, words, [], [], disjuncts]`` (the two empty lists are
        placeholders for left and right connectors); ``rule_list`` is
        sorted and ``log`` is ``{'rule_list': len(rule_list)}``.
    """
    grammar_rules = kwargs.get('grammar_rules', 1)
    verbose = kwargs.get('verbose', 'none')

    if verbose == 'debug': print('\ngrammar_learner rules =', grammar_rules)
    from src.grammar_learner.poc04 import links2stalks
    from src.utl.turtle import html_table

    stalks = links2stalks(links, clusters, grammar_rules, verbose)
    # One row per cluster; 'sum' concatenates the per-word lists.
    # NOTE(review): assumes 'words' and 'disjuncts' hold lists - confirm
    # against links2stalks.
    rules = stalks.groupby('cluster') \
        .agg({'words': 'sum', 'disjuncts': 'sum', 'count': 'sum'}).reset_index()
    # Drop duplicate disjuncts and give them a stable order.
    rules['disjuncts'] = rules['disjuncts'].apply(lambda x: sorted(set(x)))
    if verbose == 'debug': print('\nrules', type(rules), '\n', rules)

    # itertuples() rows are (index, cluster, words, disjuncts, count).
    rule_list = sorted(
        [row[1], row[2], [], [], row[3]] for row in rules.itertuples())

    if verbose not in ['min', 'none']:
        # Imported explicitly: `display` is only a builtin inside IPython.
        from IPython.display import display
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       rule_list))

    return rule_list, {'rule_list': len(rule_list)}
コード例 #2
0
def save_cat_tree(cats, output_categories, verbose='none'):  #80706 0.5
    """Write the category tree held in ``cats`` to a text file.

    Args:
        cats: dict of parallel lists; the keys read here are 'cluster',
            'parent', 'quality', 'words' and 'similarities'.  Index 0 of
            every list is skipped.
        output_categories: target file path; when it contains no '.', it
            is treated as a directory and the file is auto-named
            '<n>_cat_tree.txt', n being the number of entries (index > 0)
            whose parent is below 1.
        verbose: 'max'/'debug' print a summary line; 'debug' also renders
            the rows as an HTML table.

    Returns:
        ``{'cat_tree_file': <path of the written file>}``.
    """
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC

    tree_file = output_categories
    if '.' not in tree_file:  # directory given => auto file name
        if tree_file[-1] != '/':
            tree_file += '/'
        top_level = sum(1 for idx, parent in enumerate(cats['parent'])
                        if idx > 0 and parent < 1)
        tree_file += (str(top_level) + '_cat_tree.txt')

    categories = []
    for idx, code in enumerate(cats['cluster']):
        if idx == 0:
            continue
        categories.append([
            '' if code is None else code,
            cats['parent'][idx],
            idx,
            round(cats['quality'][idx], 2),
            # WSD tags: word@1 => word.1
            [word.replace('@', '.')
             for word in deepcopy(sorted(cats['words'][idx]))],
            cats['similarities'][idx],
        ])

    list2file(categories, tree_file)

    if verbose in ['max', 'debug']:
        saved = len(cats['cluster']) - 1
        print(UTC(), ':: src/utl.writefiles.py save_cat_tree:', saved,
              'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        header = [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']]
        display(html_table(header + categories))

    return {'cat_tree_file': tree_file}
コード例 #3
0
def save_category_tree(category_list, tree_file, verbose='none'):  #80522
    """Group categories that share a code under a parent node and save them.

    Each item of ``category_list`` is indexed as
    ``[code, parent, id, sim, words, similarities]``.  A code that occurs
    once is written unchanged.  A code that occurs several times gets a
    synthesized parent row (parent 0, merged words and similarities)
    followed by its members, re-parented to that row and with an empty code.

    Args:
        category_list: list of category rows as described above.
        tree_file: path handed to ``list2file``.
        verbose: 'max'/'debug' render the tree as an HTML table.

    Returns:
        ``{'tree_file': tree_file}``.
    """
    # Row indices per code (dicts keep first-seen order) and the largest id.
    groups = {}
    max_id = 0
    for index, cat in enumerate(category_list):
        groups.setdefault(cat[0], []).append(index)
        if cat[2] > max_id:
            max_id = cat[2]

    tree = []
    next_id = max_id
    for members in groups.values():
        if len(members) == 1:
            tree.append(category_list[members[0]])
            continue
        # Every synthesized parent needs its own id; reusing max_id + 1 for
        # all of them would make the children of different groups
        # indistinguishable.  The first parent still gets max_id + 1.
        next_id += 1
        first = category_list[members[0]]
        words = []
        similarities = []
        for j in members:
            words.extend(category_list[j][4])
            similarities.extend(category_list[j][5])
        tree.append([first[0], 0, next_id, first[3], words, similarities])
        for j in members:
            cat = category_list[j]
            tree.append(['', next_id, cat[2], cat[3], cat[4], cat[5]])

    if verbose in ['max', 'debug']:
        from src.utl.widgets import html_table
        from IPython.display import display
        display(
            html_table(
                [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] +
                tree))

    from src.utl.write_files import list2file
    list2file(tree, tree_file)

    return {'tree_file': tree_file}
コード例 #4
0
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    """Learn word categories and grammar rules from parses and save them.

    Steps: check the input parse files, copy them into a ``parses``
    directory next to the categories output, convert them to links, learn
    word categories, learn grammar rules, optionally generalise the rules,
    then save the category tree and the Link Grammar dictionary.

    Args:
        input_parses: location of the input parses, checked by
            ``check_mst_files``.
        output_categories: path/file.ext, or a directory (a string without
            '.') for an auto-generated '<n>_categories.txt' name.
        output_grammar: target passed to ``save_link_grammar``.
        **kwargs: forwarded to ``files2links``, ``category_learner``,
            ``grammar_learner`` and ``generalise_rules``.  Read here (with
            defaults): ``context`` (1), ``grammar_rules`` (1),
            ``word_space`` ('vectors'), ``clustering`` ('kmeans'),
            ``rules_generalization`` ('off'), ``verbose`` ('none').

    Returns:
        OrderedDict merging the logs returned by the pipeline steps.

    Raises:
        FileNotFoundError: when ``check_dir`` returns a false value for
            the ``parses`` directory.
    """
    def kwa(v, k):
        return kwargs.get(k, v)

    context = kwa(1, 'context')
    word_space = kwa('vectors', 'word_space')
    clustering = kwa('kmeans', 'clustering')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    verbose = kwa('none', 'verbose')

    # Downstream steps receive the positional arguments through kwargs.
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar

    import os
    import pandas as pd
    from shutil import copy2 as copy
    from collections import OrderedDict
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    from src.clustering.poc04 import clusters2dict
    from src.utl.write_files import save_link_grammar, save_category_tree
    from src.utl.widgets import html_table

    def count_codes(cat_list):
        # Number of distinct category codes (first item of every row).
        return len({x[0] for x in cat_list})

    def categories_file(cat_list):
        # Directory given (no '.') => auto name '<n>_categories.txt'.
        name = output_categories
        if '.' not in name:
            if name[-1] != '/': name += '/'
            name += str(count_codes(cat_list)) + '_categories.txt'
        return name

    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)

    # Keep a copy of the input parses next to the categories output.
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if check_dir(parse_dir, True, verbose):
        for file in files:
            copy(file, os.path.dirname(parse_dir))
    else:
        raise FileNotFoundError('File not found', input_parses)

    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)
    # The file name is only logged here; nothing is written at this point.
    cats_file = categories_file(category_list)
    log.update({'categories_file': cats_file})

    if grammar_rules != context:
        # Rebuild links with the grammar-rules context, then restore kwargs.
        # The local values are used so that a defaulted 'context' or
        # 'grammar_rules' (absent from kwargs) cannot raise KeyError.
        had_context = 'context' in kwargs
        saved_context = kwargs.get('context')
        kwargs['context'] = grammar_rules
        links, _re04 = files2links(**kwargs)
        if had_context:
            kwargs['context'] = saved_context
        else:
            del kwargs['context']

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)

    if rules_gen not in ['', 'off']:
        cats_list, re06 = generalise_rules(rule_list, **kwargs)
        log.update(re06)
        # Adopt the generalised categories only if they reduce the number
        # of distinct codes, then re-learn the rules for them.
        if count_codes(cats_list) < count_codes(category_list):
            category_list = cats_list
            cats_file = categories_file(category_list)
            log.update({'categories_file': cats_file})
            word_clusters = clusters2dict(category_list)
            rule_list, re07 = grammar_learner(word_clusters, links,
                                              **kwargs)
            log.update(re07)
            if verbose == 'debug':
                print('\nrules_generalisation ⇒ category_list:',
                      category_list)

    if verbose not in ['min', 'none']:
        # Imported explicitly: `display` is only a builtin inside IPython.
        from IPython.display import display
        display(html_table(
            [['Code', 'Parent', 'Id', 'Quality', 'Words', 'Relevance']]
            + category_list))

    # Save cat_tree.txt file: replace the '_categories.txt'-style suffix.
    # A user-supplied name without '_' falls back to dropping its extension
    # instead of raising ValueError from rindex().
    if '_' in cats_file:
        tree_stem = cats_file[:cats_file.rindex('_')]
    else:
        tree_stem = os.path.splitext(cats_file)[0]
    tree_file = tree_stem + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file, verbose)
    log.update(re08)
    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)

    return log
コード例 #5
0
def category_learner(links, **kwargs):  #80509+10
    """Cluster words into categories and return them as a list.

    Args:
        links: link data, passed to the vector-space and/or grouping steps.
        **kwargs: settings read here (with defaults):
            ``output_categories`` ('/output') - used only to derive
            ``tmpath`` and in the final message; ``tmpath`` ('' or 'auto'
            => '<output dir>/tmp'); ``context`` (1); ``word_space``
            ('vectors'); ``dim_max`` (100); ``sv_min`` (0.1);
            ``clustering`` ('kmeans', or a name starting with 'group' /
            'ident'); ``cluster_range`` ((2, 48, 1)); ``cluster_criteria``
            ('silhouette'); ``cluster_level`` (0.9);
            ``categories_generalization`` ('off'); ``verbose`` ('none').
            The whole kwargs dict is forwarded to
            ``aggregate_word_categories``.

    Returns:
        Tuple ``(category_list, log)``: the output of ``clusters2list`` and
        an OrderedDict of log entries.

    Raises:
        ValueError: for ``clustering='kmeans'`` without
            ``word_space='vectors'`` (no vectors to cluster), or for an
            unsupported ``clustering`` value.
    """
    def kwa(v, k):
        return kwargs.get(k, v)

    cats_file = kwa('/output', 'output_categories')  # to define tmpath
    tmpath = kwa('', 'tmpath')
    context = kwa(1, 'context')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    clustering = kwa('kmeans', 'clustering')
    cluster_range = kwa((2, 48, 1), 'cluster_range')
    cluster_criteria = kwa('silhouette', 'cluster_criteria')
    cluster_level = kwa(0.9, 'cluster_level')
    generalization = kwa('off', 'categories_generalization')
    verbose = kwa('none', 'verbose')

    from src.utl.utl import round2
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc04 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d

    from collections import OrderedDict
    log = OrderedDict()
    log.update({'category_learner': '80525'})

    if tmpath == '' or tmpath == 'auto':  # temporary files path
        if '.' not in cats_file:
            tmpath = cats_file  # a directory was given
        elif '/' in cats_file:
            tmpath = cats_file[:cats_file.rindex('/')]
        else:
            tmpath = '.'  # bare file name => current directory
        if not tmpath.endswith('/'): tmpath += '/'
        tmpath += 'tmp'

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    vdf = None
    if word_space == 'vectors':
        # tmpath is passed twice: it is also the directory for vectors.txt.
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, _sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if clustering == 'kmeans':
        if vdf is None:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(
                "category_learner: clustering 'kmeans' requires "
                "word_space 'vectors', got " + repr(word_space))
        n_clusters = number_of_clusters(
            vdf, cluster_range, clustering,
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, _silhouette, _inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc04/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']:
            print(clusters.applymap(round2))
            #80412 hack: plots for AGI-2018
            if context == 1:
                plot2d(1, 2, clusters, 'cluster_words', 10)
            elif len(clusters) < 6:
                plot2d(1, 3, clusters, 'cluster_words', 10)
            else:
                plot2d(1, 4, clusters, 'cluster_words', 10)

    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']: print('clustering:', clustering)
        # NOTE(review): group_links is not imported in this function; it
        # must be available at module level - confirm.
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])

    else:
        # Without this guard `clusters` would be unbound below.
        raise ValueError(
            'category_learner: unsupported clustering: ' + repr(clustering))

    # Generalization = word categories aggregation
    if generalization in ['auto', 'jaccard', 'cosine']:
        # NOTE(review): aggregate_word_categories is not imported here
        # either; expected at module level - confirm.
        categories, re02 = aggregate_word_categories(clusters,
                                                     **kwargs)  #80510
        log.update(re02)
    else:
        categories = clusters

    category_list = clusters2list(categories)
    if verbose not in ['min', 'none']:
        # Imported explicitly: `display` is only a builtin inside IPython.
        from IPython.display import display
        display(html_table(
            [['Code', 'Parent', 'Id', 'Quality', 'Words', 'Relevance']]
            + category_list))
    if verbose in ['debug']:
        print('\nWord categories:\n', categories)
    if verbose not in ['min', 'none']:
        # Since 80522 the file itself is saved by the caller.
        print('\nCategory list -', len(categories), 'lines, saved to',
              cats_file)

    return category_list, log
コード例 #6
0
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    """Learn categories and grammar rules from parses; save tree and dict.

    Steps: check the input parse files, convert them to links (saving the
    corpus statistics), learn word categories, optionally generalise them,
    attach disjuncts to the categories, induce grammar rules, optionally
    generalise the rules, then save the category tree and the Link Grammar
    dictionary.

    Args:
        input_parses: location of the input parses, checked by
            ``check_mst_files``.
        output_categories: path/file.ext or a directory; its directory
            becomes the project directory for 'corpus_stats.txt'.
        output_grammar: target passed to ``save_link_grammar``.
        **kwargs: forwarded to ``files2links``, ``category_learner``,
            ``generalize_categories`` and ``generalize_rules``.  Read here
            (with defaults): ``context`` (1), ``grammar_rules`` (1),
            ``word_space`` ('vectors'), ``clustering`` ('kmeans'),
            ``categories_generalization`` ('off'),
            ``rules_generalization`` ('off'), ``verbose`` ('none').

    Returns:
        OrderedDict with 'start'/'finish' timestamps and the merged logs of
        the pipeline steps.
    """
    def kwa(v, k):
        return kwargs.get(k, v)

    context = kwa(1, 'context')
    word_space = kwa('vectors', 'word_space')
    clustering = kwa('kmeans', 'clustering')
    cats_gen = kwa('off', 'categories_generalization')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    verbose = kwa('none', 'verbose')
    chatty = verbose in ['max', 'debug']

    print('learn_grammar: grammar_rules:', grammar_rules)

    # Downstream steps receive the positional arguments through kwargs.
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar

    import os
    from collections import OrderedDict
    import pandas as pd
    from src.utl.utl import UTC
    from src.utl.read_files import check_mst_files
    from src.space.poc05 import files2links  #80528 .poc05
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table
    from src.grammar_learner.generalization import generalize_categories, \
        cats2list, generalize_rules

    rules_header = [['Code', 'Parent', 'Id', 'Quality', 'Words',
                     'Disjuncts', 'djs', 'Relevance', 'Children']]

    def add_disjuncts(cats, links, verbose='none'):
        """Return a deep copy of ``cats`` with 'counts', 'disjuncts', 'djs'.

        NOTE(review): the grouped rows are appended after a dummy entry for
        index 0, which presumes every cluster id 1..n occurs in ``links``
        - confirm.
        """
        talk = verbose in ['max', 'debug']
        if talk:
            print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']),
                  'clusters')
        from copy import deepcopy
        top_clusters = [i for i, x in enumerate(cats['cluster'])
                        if i > 0 and x is not None]
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if talk:
            print(UTC(), ':: add_disjuncts: word_clusters:',
                  len(word_clusters), 'words')
        df = links.copy()
        if talk:
            print(UTC(),
                  ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        # One-item lists, so that 'sum' below concatenates them per cluster.
        df['links'] = [[x] for x in df['link']]
        if talk:
            print(
                UTC(),
                ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
            )
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if talk:
            print(UTC(),
                  ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        cdf = df.groupby('cluster').agg({
            'links': 'sum',
            'count': 'sum'
        }).reset_index()
        disjuncts = [[]] + cdf['links'].tolist()
        counts = [0] + cdf['count'].tolist()
        if talk:
            # Reports `cats`: `rules` does not exist yet when this runs.
            print(UTC(),':: add_disjuncts: len(cluster, disjuncts):', \
                  len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
        fat_cats = deepcopy(cats)
        fat_cats['counts'] = counts
        fat_cats['disjuncts'] = disjuncts
        # 'djs': every disjunct replaced by its rank among all distinct
        # disjuncts; a dict lookup avoids a list.index() scan per item.
        djlist = sorted({dj for djs in disjuncts for dj in djs})
        dj_index = {dj: n for n, dj in enumerate(djlist)}
        fat_cats['djs'] = [{dj_index[dj] for dj in djs} for djs in disjuncts]
        if talk:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats

    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})

    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})

    kwargs['input_files'] = files

    # files ⇒ links:
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats - implanted in files2links 80605
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if chatty:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    # Learn categories: new 80619
    categories, re03 = category_learner(links, **kwargs)  #v.0.5 categories: {}
    log.update(re03)

    # Generalize categories
    if cats_gen == 'jaccard' or (cats_gen == 'auto' and clustering == 'group'):
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto'
                                  and clustering == 'kmeans'):
        #TODO: vectors g12n
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print(
                '#TODO: categories generalization based on cosine similarity')
    else:
        # Also taken for the default cats_gen == 'off'.
        gen_cats = categories
        log.update({'generalization': 'error: cats_gen = ' + str(cats_gen)})
        if chatty:
            print(UTC(),':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')

    # Learn grammar     #80623
    if grammar_rules != context:
        # Rebuild links with the grammar-rules context, then restore kwargs.
        # The local values are used so that a defaulted 'context' or
        # 'grammar_rules' (absent from kwargs) cannot raise KeyError.
        had_context = 'context' in kwargs
        saved_context = kwargs.get('context')
        kwargs['context'] = grammar_rules
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, _re06 = files2links(**kwargs)
        if had_context:
            kwargs['context'] = saved_context
        else:
            del kwargs['context']

    # add disjuncts to categories {}  after k-means clustering  #TODO: speed!
    if word_space == 'vectors' or clustering == 'kmeans':
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
        fat_cats = add_disjuncts(gen_cats, links, verbose)
        if chatty:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
    else:
        fat_cats = gen_cats

    # Learn Grammar
    rules, _re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        # Imported explicitly: `display` is only a builtin inside IPython.
        from IPython.display import display
        print('induce_grammar ⇒ rules:')
        display(html_table(rules_header + [x for x in cats2list(rules)]))

    # Generalize grammar rules
    gen_rules = rules
    if rules_gen not in ['', 'off']:
        gen_rules, re08 = generalize_rules(rules, **kwargs)
        log.update(re08)
        if verbose == 'debug':
            from IPython.display import display
            print('generalize_rules ⇒ gen_rules:')
            display(html_table(
                rules_header + [x for x in cats2list(gen_rules)]))

    # Save cat_tree.txt file
    re09 = save_cat_tree(gen_rules, output_categories,
                         verbose='none')  #FIXME: verbose?
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})

    return log
コード例 #7
0
def induce_grammar(categories,
                   links,
                   verbose='none'):  #80620 learn_grammar replacement
    """Translate word-based disjuncts into cluster-id based grammar rules.

    Every string disjunct such as ``'a- & was-'`` becomes a tuple of signed
    cluster indices, e.g. ``(-9, -26)``: each ``word+`` / ``word-``
    connector is replaced by the index of the cluster containing the word,
    negated for ``-``.  Non-string disjuncts are skipped.

    Args:
        categories: dict of parallel lists; the keys read here are
            'cluster', 'words', 'disjuncts' and 'parent'.  Index 0 and
            entries whose 'cluster' is None are not processed.  The input
            is not modified.
        links: unused; kept for interface compatibility.
        verbose: 'debug' prints progress and renders HTML tables.

    Returns:
        Tuple ``(rules, log)``: ``rules`` is a deep copy of ``categories``
        whose 'disjuncts' entries for the processed clusters are sets of
        tuples; ``log`` holds 'learned_rules' (entries with parent 0,
        index > 0) and 'total_clusters' (``len(cluster) - 1``).

    Raises:
        KeyError: when a connector names a word that belongs to no
            processed cluster.
    """
    import copy
    rules = copy.deepcopy(categories)

    clusters = [
        i for i, x in enumerate(rules['cluster']) if i > 0 and x is not None
    ]
    # word => index of its cluster (a later cluster wins on duplicates)
    word_clusters = {
        word: i for i in clusters for word in rules['words'][i]
    }

    debug = verbose == 'debug'
    if debug:
        # Imported before first use, and only on the debug path: importing
        # them further down made these names unbound for the table below.
        from IPython.display import display
        from src.utl.widgets import html_table
        from src.grammar_learner.generalization import cats2list
        header = [['Code', 'Parent', 'Id', 'Quality', 'Words', 'Disjuncts',
                   'djs', 'Relevance', 'Children']]
        print('induce_grammar: rules.keys():', rules.keys())
        print('induce_grammar: clusters:', clusters)
        print('induce_grammar: word_clusters:', word_clusters)
        print('induce_grammar: rules ~ categories:')
        display(html_table(
            header + [x for i, x in enumerate(cats2list(rules)) if i < 4]))

    for cluster in clusters:
        djs = []
        for rule in categories['disjuncts'][cluster]:
            if not isinstance(rule, str):
                continue
            # 'a- & was-' ⇒ (-9,-26)
            dj = []
            for token in rule.split():
                if token == '&':
                    continue
                if token[-1] == '+':
                    dj.append(word_clusters[token[:-1]])
                elif token[-1] == '-':
                    dj.append(-1 * word_clusters[token[:-1]])
                else:
                    print('no sign?', dj)  #TODO:ERROR?
            djs.append(tuple(dj))
            if debug:
                print('induce_gramma: cluster', cluster, '::', rule, '⇒',
                      tuple(dj))
        rules['disjuncts'][cluster] = set(djs)
        if debug:
            print('induce_grammar: rules["disjuncts"][' + str(cluster) + ']',
                  rules['disjuncts'][cluster])

    if debug:
        print('induce_grammar: updated disjuncts:')
        display(html_table(
            header + [x for i, x in enumerate(cats2list(rules)) if i < 32]))

    learned = sum(
        1 for i, x in enumerate(rules['parent']) if x == 0 and i > 0)
    return rules, {'learned_rules': learned,
                   'total_clusters': len(rules['cluster']) - 1}