def learn_lexical_entries(input_dir, cat_path, dict_path, verbose='none',
                          parse_mode='given'):
    """Learn word categories and Link Grammar rules from parsed corpus files.

    input_dir  -- directory containing parse (.txt) files
    cat_path   -- directory for the categories.txt output
    dict_path  -- directory for the Link Grammar .dict output
    verbose    -- 'none' | 'min' | 'max' logging level
    parse_mode -- reserved; parse interpretation mode (currently unused here)

    Returns {'categories_file': <path>, 'grammar_file': <path>}.
    Raises FileNotFoundError if input_dir is missing or holds no files.
    """
    from IPython.display import display
    from src.utl.turtle import html_table
    from src.utl.read_files import check_dir, check_dir_files, check_corpus
    from src.utl.write_files import list2file
    from src.link_grammar.turtle import \
        files2disjuncts, lexical_entries, entries2clusters, entries2categories, \
        disjuncts2clusters, entries2rules, save_link_grammar

    # Validate the input directory and its corpus files.
    # FIX: the original fell through with `files` unbound (NameError at the
    # files2disjuncts call) when the directory was missing or empty; now we
    # fail fast with a clear error instead.
    files = []
    if check_dir(input_dir, create=False, verbose=verbose):
        files = check_dir_files(input_dir, verbose=verbose)
        if len(files) > 0:
            if verbose == 'max': print(files)
            for i, file in enumerate(files):
                if check_corpus(file, verbose=verbose):
                    if verbose == 'max':
                        print('File #' + str(i), file, 'checked')
                else:
                    print('File #' + str(i), file, 'check failed')
        else:
            print('Input directory', input_dir, 'is empty')
    else:
        print('No input directory', input_dir)
    if not files:
        raise FileNotFoundError('No corpus files found in ' + str(input_dir))

    log = {'project': 'Grammar Learner -- Lexical entries'}  # use OR DEL?
    disjuncts = files2disjuncts(files, 'LEFT-WALL', True, verbose)  #TODO: parse_mode?
    entries = lexical_entries(disjuncts)
    category_list = entries2categories(entries)
    if verbose == 'max':
        display(html_table(
            [['Parent', 'Category', 'Quality', 'Words', 'Relevance']]
            + category_list))

    # Save the categories file.
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose == 'max':
        for line in categories.splitlines()[:3]: print(line)
        print('<...>\nTotal', len(categories.splitlines()),
              'lines, saved to', cat_file)

    # Induce and save Link Grammar rules.
    lg_rule_list = entries2rules(disjuncts2clusters(entries2clusters(entries)))
    if verbose == 'max':
        display(html_table(
            [['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] + lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines(): print(line)

    # save_link_grammar's last summary line ends with ': <path>' -- extract it.
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
def save_cat_tree(cats, output_categories, verbose='none'):  #80706 0.5
    """Write the category tree held in `cats` to a text file.

    cats               -- dict of parallel lists ({'cluster': [], 'parent': [],
                          'quality': [], 'words': [], 'similarities': [], ...});
                          row 0 is a root/header entry and is skipped
    output_categories  -- target file path; a directory (no '.' in the string)
                          triggers an auto-generated '<N>_cat_tree.txt' name
    verbose            -- 'none' | 'min' | 'max' | 'debug'

    Returns {'cat_tree_file': <path written>}.
    """
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC

    tree_file = output_categories
    if '.' not in tree_file:  # directory given ⇒ build the file name
        if tree_file[-1] != '/':
            tree_file += '/'
        # Top-level categories: rows beyond the root whose parent id is < 1.
        n_cats = len([p for k, p in enumerate(cats['parent']) if k > 0 and p < 1])
        tree_file += str(n_cats) + '_cat_tree.txt'

    rows = []
    for idx in range(1, len(cats['cluster'])):  # skip row 0 (root entry)
        label = cats['cluster'][idx] if cats['cluster'][idx] is not None else ''
        # WSD suffixes: word@1 ⇒ word.1  (80706)
        words = [w.replace('@', '.') for w in deepcopy(sorted(cats['words'][idx]))]
        rows.append([
            label,
            cats['parent'][idx],
            idx,
            round(cats['quality'][idx], 2),
            words,
            cats['similarities'][idx],
        ])

    string = list2file(rows, tree_file)

    if verbose in ['max', 'debug']:
        print(UTC(), ':: src/utl.writefiles.py save_cat_tree:',
              len(cats['cluster']) - 1, 'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        display(html_table(
            [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] + rows))
    return {'cat_tree_file': tree_file}
def save_category_tree(category_list, tree_file, verbose='none'):  #80522
    """Group a flat category list into a 2-level tree and save it.

    category_list -- rows of [code, parent, id, sim, words, similarities]
    tree_file     -- output path handed to list2file
    verbose       -- 'max' / 'debug' additionally render the tree as a table

    Rows sharing the same code (row[0]) are merged under a synthetic parent
    row; codes occurring once are copied through unchanged.
    Returns {'tree_file': tree_file}.
    """
    # FIX: dropped the unused local `import os` (its only use was a
    # commented-out line) and the unreachable else/'WTF?' branch -- every
    # clusters[] entry is created with at least one index appended.
    cats = category_list
    clusters = {}   # code ⇒ list of row indices carrying that code
    m = 0           # largest category id (row[2]) seen in the input
    for i, x in enumerate(cats):
        clusters.setdefault(x[0], []).append(i)
        if x[2] > m:
            m = x[2]
    tree = []
    for k, v in clusters.items():
        if len(v) == 1:
            tree.append(cats[v[0]])
        else:
            # Synthetic parent aggregating all member words/similarities.
            # NOTE(review): every multi-member group reuses the same parent
            # id m + 1, so parent ids are not unique -- confirm intended.
            words = []
            similarities = []
            for j in v:
                words.extend(cats[j][4])
                similarities.extend(cats[j][5])
            tree.append(
                [cats[v[0]][0], 0, m + 1, cats[v[0]][3], words, similarities])
            for j in v:
                tree.append([
                    '', m + 1, cats[j][2], cats[j][3], cats[j][4], cats[j][5]
                ])
    if verbose in ['max', 'debug']:
        from src.utl.widgets import html_table
        from IPython.display import display
        display(html_table(
            [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] + tree))
    from src.utl.write_files import list2file
    string = list2file(tree, tree_file)
    return {'tree_file': tree_file}
def category_learner(links, \
        cat_path, dict_path, tmpath = '', verbose = 'none', \
        parse_mode = 'given', left_wall = '', period = False, \
        context = 1, window = 'mst', weighting = 'ppmi', group = True, \
        word_space = 'vectors', dim_max = 100, sv_min = 0.1, dim_reduction = 'svm', \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9):  #, \
    #-generalization = 'off', merge = 0.8, aggregate = 0.2, grammar_rules = 1):
    """Learn word categories from parse links (Grammar Learner v.0.3).

    links            -- parse-link data (indexed like a pandas DataFrame below)
    cat_path         -- directory for the generated categories file
    dict_path        -- project dir; default location for temporary files
    tmpath           -- temp-file directory ('' ⇒ fall back to dict_path)
    word_space       -- 'vectors' ⇒ build a PMI-SVD vector space
    clustering       -- 'kmeans', or 'group...'/'ident...' (identical entries)
    cluster_range    -- (min, max, step) for the cluster-number search
    cluster_criteria -- metric used to pick the number of clusters
    cluster_level    -- threshold level for that criteria
    Remaining parameters are accepted for call compatibility; not used here.

    Returns (category_list, log) -- saved category rows and a run-log dict.
    """
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc03 import number_of_clusters, clusters2list  #80422
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    log = {'project': 'Grammar Learner v.0.3 2018-04-11', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''
    if tmpath == '': tmpath = dict_path  # temporary files path
    if verbose == 'debug':
        print('category_learner: word_space:', word_space,
              '/ clustering:', clustering)
    if word_space == 'vectors':
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        vdf, sv, res3 = pmisvd(links, tmpath, tmpath, dim)
        log.update(res3)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space,
              '/ clustering:', clustering)
    if clustering == 'kmeans':
        # NOTE(review): `vdf` is only bound in the word_space == 'vectors'
        # branch above; kmeans with any other word_space raises NameError here.
        #^from src.clustering.kmeans import cluster_words_kmeans
        #^from src.clustering.poc03 import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range ,clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc03/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']: print(clusters.applymap(round2))
        if verbose in ['max', 'debug']:  #80412 hack: plots for AGI-2018 :(
            if context == 1:  #FIXME:DEL?
                plot2d(1, 2, clusters, 'cluster_words', 10)
            else:
                if len(clusters) < 6:
                    plot2d(1, 3, clusters, 'cluster_words', 10)
                else:
                    plot2d(1, 4, clusters, 'cluster_words', 10)
    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']: print('clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        # NOTE(review): group_links is neither imported nor defined in this
        # version -- this branch raises NameError unless the name exists at
        # module level; confirm before relying on 'group'/'ident' clustering.
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))
    # Generalization  #TODO next week
    # Save categories
    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        # NOTE(review): `display` is not imported in this function -- assumes
        # an IPython session or a module-level import; confirm.
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + str(len(clusters)) + '_categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''
    if verbose in ['debug']:
        print('\nWord categories:\n')
        for line in categories.splitlines()[:3]: print(line)
    if verbose not in ['min', 'none']:
        print('\nCategory list -', len(categories.splitlines()),
              'lines, saved to', cat_file)
    return category_list, log
def learn_connectors(input_dir, cat_path, dict_path, verbose='none', \
        parse_mode='given', \
        word_space = 'hyperwords', dim_max = 100, sv_min = 0.1, \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = ''):
    """Cluster words into categories and emit a connector-based grammar.

    input_dir -- directory with MST parse files
    cat_path  -- directory for categories.txt
    dict_path -- directory for the Link Grammar .dict file
    tmpath    -- temp-file directory ('' ⇒ fall back to dict_path)
    Clustering parameters mirror category_learner; see that function.

    Returns {'categories_file': ..., 'grammar_file': ...}.
    """
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.utl.read_files import check_mst_files
    from src.space.poc import files2links
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    log = {'project': 'Unified Grammar Learner: Clustering words', \
           'date': str(UTC()), 'project_dir': dict_path, 'input_dir': input_dir }
    """TODO: dict ⇒ list [[]] / OrderedDict?"""
    # Collect parse files, then build word-link co-occurrence data.
    files, response = check_mst_files(input_dir, verbose='none')
    links = files2links(files, parse_mode='given', context=1, group = True, \
        left_wall='LEFT-WALL', period=True, verbose='none')
    # vector_space_dim(links, path, tmpath, dim_max=100, sv_min=0.9, 'max')
    if tmpath == '': tmpath = dict_path
    # Pick vector-space dimensionality, then build the PMI-SVD word space.
    dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
    log.update({'vector_space_dim': dim})
    if verbose not in ['none', 'min']:
        print('Optimal vector space dimensionality:', dim)
    vdf, sv, res2 = pmisvd(links, dict_path, tmpath, dim)
    log.update(res2)
    #-vdf.applymap(round2).sort_values(by=[1,2,3], ascending=[False,False,False])
    # K-means clustering of word vectors.
    n_clusters = number_of_clusters(vdf, cluster_range ,clustering, \
        criteria=cluster_criteria, level=cluster_level, verbose=verbose)
    if verbose not in ['none', 'min']:
        print('Optimal number of clusters:', n_clusters)
    clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
    if verbose in ['max', 'debug']: plot2d(1, 2, clusters, 'cluster_words', 10)
    # Generalisation - just histogram? - Grammar-Learner-Clustering-Words 2.6
    import numpy as np
    from src.clustering.similarity import cluster_similarity
    sim_df, res3 = cluster_similarity(clusters, 'max')
    log.update(res3)
    if verbose in ['max', 'debug']:
        count, division = np.histogram(sim_df['similarity'])
        sim_df['similarity'].hist(bins=division)
        print('Cluster similarities: absolute values > 0.1:')
        # NOTE(review): the expression below computes a filtered view and
        # discards it -- only useful in a notebook cell; no effect here.
        sim_df.sort_values(by='similarity', ascending=False).loc[(sim_df['similarity']) > 0.1]
    # Save categories
    category_list = clusters2list(clusters)
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose in ['max', 'debug']:
        for line in categories.splitlines(): print(line)
        print('<...>\nTotal', len(categories.splitlines()), \
            'lines, saved to', cat_file)
    #-print(len(categories.splitlines()), 'categories saved to', cat_file)
    # Grammar Learner
    # NOTE(review): `grammar_learner` and `display` are not imported or
    # defined in this function -- both must resolve at module level; confirm.
    lg_rule_list = grammar_learner(clusters, links, verbose)
    if verbose == 'max':
        display(html_table(
            [['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] + lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines(): print(line)
    #-return categories, lg_rules_dict  #TODO: return paths to categories and dict?
    # save_link_grammar's last summary line ends with ': <path>' -- extract it.
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
def category_learner(links, \
        cat_path, dict_path, verbose='none', \
        parse_mode='given', \
        word_space = 'vectors', dim_max = 100, sv_min = 0.1, \
        dim_reduction = 'svm', \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = '',
        generalization = 'off', grammar_rules = 'connectors'):
    # no actual need need for grammar rules here?
    """Learn word categories from parse links (Grammar Learner v.0.2).

    NOTE(review): this redefines `category_learner` -- an earlier definition
    with the same name exists above in this file, and this one replaces it at
    module load time. Confirm which version callers are meant to get.

    links     -- parse-link data (pandas DataFrame, per the usage below)
    cat_path  -- directory for the generated categories file
    dict_path -- project dir; default location for temporary files
    tmpath    -- temp-file directory ('' ⇒ fall back to dict_path)
    Other parameters mirror the v.0.3 variant above.

    Returns (category_list, log).
    """
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    log = {'project': 'Grammar Learner v.0.2 2018-04-06', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''
    if word_space == 'vectors':
        if tmpath == '': tmpath = dict_path
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)
    else:  #TODO: word_space = 'discrete'...
        # NOTE(review): this branch is a verbatim copy of the 'vectors'
        # branch above -- the discrete word space is not implemented yet.
        if tmpath == '': tmpath = dict_path
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min': print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)
    # Clustering
    #-clustering = 'group'
    if clustering == 'kmeans':
        #^from src.clustering.poc import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range ,clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('Optimal number of clusters:', n_clusters)
        if verbose == 'max': plot2d(1, 2, clusters, 'cluster_words', 10)
    elif clustering[:5] in ['group', 'ident']:  #80606 test ~OK
        def group_links(links):
            # Group identical lexical entries into clusters (local shim).
            #+TODO: old code ⇒ here ⇒ src.clustering.group_links.py
            #-Old way:
            from src.link_grammar.turtle import lexical_entries, entries2clusters
            djs = links.rename(columns={'link': 'disjunct'})
            #-clusters = entries2clusters(lexical_entries(djs))
            entries = lexical_entries(djs)
            clusters = entries2clusters(entries).rename(
                columns={'germs': 'cluster_words'})
            return clusters
        #+from ... import group links
        clusters = group_links(links)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose == 'max':
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))
    # Generalization  #TODO next week
    # Save categories
    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        # NOTE(review): `display` is not imported in this function -- assumes
        # an IPython session or a module-level import; confirm.
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''
    if verbose == 'max':
        for line in categories.splitlines()[:3]: print(line)
    if verbose != 'none':
        print('<...>\nTotal', len(categories.splitlines()),
              'lines, saved to', cat_file)
    return category_list, log
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    """End-to-end Grammar Learner pipeline (v.0.5 era, 80605).

    input_parses      -- dir with .txt parse files
    output_categories -- path/file.ext, or dir ⇒ auto file name
    output_grammar    -- path/file.ext, or dir ⇒ auto file name
    kwargs            -- pipeline options; see the kwa(...) defaults below

    Pipeline: check input files ⇒ build word-link data ⇒ learn categories ⇒
    optionally generalize categories ⇒ re-extract links in grammar-rules
    context ⇒ attach disjuncts ⇒ induce rules ⇒ optionally generalize rules
    ⇒ save category tree and Link Grammar .dict file.
    Returns an OrderedDict run log.
    """
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar - path/file.ext / dir ⇒ auto file name
    # kwa: keyword-argument accessor with a default value
    def kwa(v, k): return kwargs[k] if k in kwargs else v
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level = kwa(0.9, 'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge = kwa(0.8, 'categories_merge')
    #-cats_aggr = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off', 'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge = kwa(0.8, 'rules_merge'),  # merge rules with similarity > this 'merge' criteria
    #-rules_aggr = kwa(0.3, 'rules_aggregation'),  # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')
    print('learn_grammar: grammar_rules:', grammar_rules)
    #80509 TODO: renamed parameters ⇒ update code
    # Echo resolved paths back into kwargs for the downstream **kwargs calls.
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar
    import os, pickle  #, collections
    from collections import OrderedDict
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc05 import files2links  #80528 .poc05
    from src.clustering.poc05 import clusters2dict
    #+from src.link_grammar.poc05 import category_learner
    #+from src.link_grammar.poc05 import induce_grammar
    # NOTE(review): the category_learner / induce_grammar imports above are
    # commented out -- both names must resolve at module level (e.g. the
    # category_learner defined earlier in this file); confirm induce_grammar.
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table, plot2d
    from src.grammar_learner.generalization import generalize_categories, \
        reorder, cats2list, generalize_rules  #, aggregate, aggregate_word_categories\
    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})
    #TODO: save kwargs?
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    # Project directory: the categories path itself, or its parent dir.
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})
    #-Save a copy of input parses to prj_dir + '/parses/'  #FIXME:DEL? #80704
    #-parse_dir = prj_dir + '/parses/'
    #-if check_dir(parse_dir, True, verbose):
    #-    for file in files: copy(file, os.path.dirname(parse_dir))
    #-else: raise FileNotFoundError('File not found', input_parses)
    # group = True  #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files
    # files ⇒ links:
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats - implanted in files2links 80605
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if verbose in ['max', 'debug']:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space,
              '/ clustering:', clustering)
    # Learn categories: new 80619
    categories, re03 = category_learner(links, **kwargs)  #v.0.5 categories: {}
    log.update(re03)
    # Generalize categories  #TODO? "gen_cats" ⇒ "categories"? no new name
    if cats_gen == 'jaccard' or (cats_gen == 'auto' and clustering == 'group'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto' and clustering == 'kmeans'):
        #TODO: vectors g12n
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print('#TODO: categories generalization based on cosine similarity')
    else:
        gen_cats = categories
        log.update({'generalization': 'error: cats_gen = ' + str(cats_gen)})
        if verbose in ['max', 'debug']:
            print(UTC(),':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')
    # Save 1st cats_file = to control 2-step generalization  #FIXME:DEL? #80704
    #-re05 = save_cat_tree(gen_cats, output_categories, verbose)
    #-log.update({'category_tree_file': re05['cat_tree_file']})
    # Save cats.pkl
    #-with open(re05['cat_tree_file'][:-3]+'pkl', 'wb') as f:  #FIXME:DEL? #80704
    #-    pickle.dump(gen_cats, f)
    #-if verbose in ['max','debug']:
    #-    print(UTC(),':: learn_grammar: 1st cat_tree saved')
    # Learn grammar  #80623
    # Re-extract links with context switched to the grammar-rules setting.
    # NOTE(review): reads kwargs['context'] / kwargs['grammar_rules'] directly
    # -- KeyError if the caller relied on the kwa defaults above; confirm.
    if grammar_rules != context:
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, re06 = files2links(**kwargs)
        kwargs['context'] = context
    # add disjuncts to categories {} after k-means clustering  #TOEO: speed!
    def add_disjuncts(cats, links, verbose='none'):
        # Attach per-cluster disjunct lists, ids and counts to a categories
        # dict; returns a deep copy ('fat_cats') with the extra keys.
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cats:',
                  len(cats['cluster']), 'clusters')
        from copy import deepcopy
        top_clusters = [i for i,x in enumerate(cats['cluster']) \
            if i > 0 and x is not None]
        # word ⇒ cluster-row index, for every word of every top cluster
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: word_clusters:',
                  len(word_clusters), 'words')
        df = links.copy()
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        df['links'] = [[x] for x in df['link']]
        if verbose in ['max', 'debug']:
            print(
                UTC(),
                ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
            )
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        # Aggregate disjunct lists and counts per cluster.
        cdf = df.groupby('cluster').agg({
            'links': 'sum',
            'count': 'sum'
        }).reset_index()  #TODO? del df[...] to free RAM?
        disjuncts = [[]] + cdf['links'].tolist()
        counts = [0] + cdf['count'].tolist()
        if verbose in ['max', 'debug']:
            # NOTE(review): `rules` is not defined in this scope -- this debug
            # branch raises NameError; probably meant cats['cluster'].
            print(UTC(),':: add_disjuncts: len(cluster, disjuncts):', \
                len(rules['cluster']), len(disjuncts), '\ncounts:', counts)
        fat_cats = deepcopy(cats)
        fat_cats['counts'] = [0] + cdf['count'].tolist()
        fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()  #['djs']
        # Index disjuncts: each row gets the set of ids of its disjuncts.
        djset = set()
        [[djset.add(y) for y in x] for x in fat_cats['disjuncts']]
        djlist = sorted(djset)
        fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
            for y in fat_cats['disjuncts']]
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats
    #TODO: def djs? vectors(disjuncts, **kwargs)
    #if context < 2 and grammar_rules > 1:
    if word_space == 'vectors' or clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
        #with open(re05['cat_tree_file'][:-9]+'s.pkl', 'wb') as f:  #FIXME:DEL tmp 80601
        #    pickle.dump(gen_cats, f)
        fat_cats = add_disjuncts(gen_cats, links)
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
        #TODO: fat_cats['djs'] = djs(fat_cats[disjuncts], **kwargs)
    #TODO:
    else: fat_cats = gen_cats
    # Learn Grammar
    #+from src.grammar_learner.poc05 import induce_grammar
    rules, re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        print('induce_grammar ⇒ rules:')
        display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
            + [x for i,x in enumerate(cats2list(rules))]))
    # Generalize grammar rules
    gen_rules = rules
    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            from src.grammar_learner.generalization import generalize_rules
            gen_rules, re08 = generalize_rules(rules, **kwargs)
            log.update(re08)
            if verbose == 'debug':
                print('generalize_rules ⇒ gen_rules:')
                display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
                    + [x for i,x in enumerate(cats2list(gen_rules))]))
    # Save cat_tree.txt file
    #^from src.utl.write_files import save_cat_tree
    re09 = save_cat_tree(gen_rules, output_categories, verbose='none')  #FIXME: verbose?
    #TODO: check file save error?
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})
    #TODO: elapsed execution time?  Save log?
    return log