def grammar_learner(clusters, links, **kwargs):
    """Build a sorted list of grammar rules from word clusters and links.

    The links are turned into "stalks" by ``links2stalks``; the stalks are
    grouped by cluster, and the ``words``, ``disjuncts`` and ``count`` columns
    are summed per cluster (presumably ``words``/``disjuncts`` hold lists, so
    the sum concatenates them -- confirm against ``links2stalks``).
    Disjuncts are then de-duplicated and sorted.

    Args:
        clusters: word clusters, passed through to ``links2stalks``.
        links: links data, passed through to ``links2stalks``.
        **kwargs: optional settings; only these are read here:
            grammar_rules (default 1) -- forwarded to ``links2stalks``;
            verbose (default 'none') -- 'debug' prints intermediate data,
            anything except 'min'/'none' displays the rules as an HTML table.

    Returns:
        tuple: ``(rule_list, {'rule_list': len(rule_list)})`` where each rule
        is ``[cluster, words, [], [], disjuncts]`` (the two empty lists are
        the left/right connector placeholders) and the list is sorted.
    """
    grammar_rules = kwargs.get('grammar_rules', 1)
    verbose = kwargs.get('verbose', 'none')
    if verbose == 'debug':
        print('\ngrammar_learner rules =', grammar_rules)

    from src.grammar_learner.poc04 import links2stalks
    from src.utl.turtle import html_table

    stalks = links2stalks(links, clusters, grammar_rules, verbose)
    rules = stalks.groupby('cluster') \
        .agg({'words': 'sum', 'disjuncts': 'sum', 'count': 'sum'}) \
        .reset_index()
    # set() removes duplicate disjuncts, sorted() makes the order stable
    rules['disjuncts'] = rules['disjuncts'].apply(lambda x: sorted(set(x)))
    if verbose == 'debug':
        print('\nrules', type(rules), '\n', rules)

    # itertuples(): row[0] is the index, then cluster, words, disjuncts, count
    rule_list = sorted(
        [row[1], row[2], [], [], row[3]] for row in rules.itertuples())

    if verbose not in ['min', 'none']:
        # display() is only a builtin inside IPython/Jupyter sessions;
        # import it explicitly so this also works from a plain interpreter.
        from IPython.display import display
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       rule_list))
    return rule_list, {'rule_list': len(rule_list)}
def save_cat_tree(cats, output_categories, verbose='none'):
    """Write the category tree held in ``cats`` to a text file.

    Args:
        cats: dict of parallel lists; the keys read here are 'cluster',
            'parent', 'quality', 'words' and 'similarities'. Index 0 of every
            list is skipped.
        output_categories: target path. When it contains no '.' at all it is
            treated as a directory and the file name
            ``<n>_cat_tree.txt`` is appended, where ``n`` is the number of
            entries (index > 0) whose parent is < 1.
        verbose: 'max'/'debug' print a one-line summary; 'debug' additionally
            displays the rows as an HTML table.

    Returns:
        dict: ``{'cat_tree_file': <path written>}``.
    """
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC

    tree_file = output_categories
    if '.' not in tree_file:
        # directory given: derive the file name from the top-level count
        if tree_file[-1] != '/':
            tree_file = tree_file + '/'
        top_level = [
            parent for idx, parent in enumerate(cats['parent'])
            if idx > 0 and parent < 1
        ]
        tree_file = tree_file + (str(len(top_level)) + '_cat_tree.txt')

    categories = []
    for idx in range(1, len(cats['cluster'])):
        code = cats['cluster'][idx]
        row = ['' if code is None else code]
        row.append(cats['parent'][idx])
        row.append(idx)
        row.append(round(cats['quality'][idx], 2))
        # word-sense tags: word@1 is written as word.1
        row.append([
            word.replace('@', '.')
            for word in deepcopy(sorted(cats['words'][idx]))
        ])
        row.append(cats['similarities'][idx])
        categories.append(row)

    list2file(categories, tree_file)

    if verbose in ['max', 'debug']:
        print(UTC(),':: src/utl.writefiles.py save_cat_tree:', \
            len(cats['cluster']) - 1, 'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        header = [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']]
        display(html_table(header + categories))

    return {'cat_tree_file': tree_file}
def save_category_tree(category_list, tree_file, verbose='none'):
    """Group category rows by cluster code and save them as a two-level tree.

    Each row of ``category_list`` is indexed as
    ``[code, parent, id, sim, words, similarities]``. Rows sharing the same
    code are placed under a newly created parent row that carries the code,
    the concatenated words and similarities of its members, and a fresh id;
    the member rows follow with an empty code and that id as their parent.
    Rows with a unique code are copied unchanged.

    Args:
        category_list: list of category rows as described above.
        tree_file: path handed to ``list2file``.
        verbose: 'max'/'debug' display the resulting tree as an HTML table.

    Returns:
        dict: ``{'tree_file': tree_file}``.
    """
    from src.utl.write_files import list2file

    cats = category_list

    # Row indices per cluster code, plus the largest id already in use
    clusters = {}
    max_id = 0
    for i, cat in enumerate(cats):
        clusters.setdefault(cat[0], []).append(i)
        if cat[2] > max_id:
            max_id = cat[2]

    tree = []
    next_id = max_id + 1
    for members in clusters.values():
        first = cats[members[0]]
        if len(members) == 1:
            tree.append(first)
            continue
        words = []
        similarities = []
        for j in members:
            words.extend(cats[j][4])
            similarities.extend(cats[j][5])
        tree.append([first[0], 0, next_id, first[3], words, similarities])
        for j in members:
            tree.append(
                ['', next_id, cats[j][2], cats[j][3], cats[j][4], cats[j][5]])
        # Every merged group needs its own parent id: re-using one id for
        # all groups would make the child -> parent references ambiguous.
        next_id += 1

    if verbose in ['max', 'debug']:
        from src.utl.widgets import html_table
        from IPython.display import display
        display(
            html_table(
                [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] +
                tree))

    list2file(tree, tree_file)
    return {'tree_file': tree_file}
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    """Legacy pipeline: parses -> links -> categories -> rules -> files.

    NOTE(review): another ``learn_grammar`` with the same signature is
    defined further down in this source; if both live in one module the
    later definition replaces this one.

    Args:
        input_parses: location of the input parses (original note: dir with
            .txt files); handed to ``check_mst_files``.
        output_categories: path/file.ext, or a directory (any path without
            a '.') for which ``<N>_categories.txt`` is derived automatically.
        output_grammar: path/file.ext or directory, passed to
            ``save_link_grammar``.
        **kwargs: optional settings forwarded to the pipeline stages. Read
            directly here: ``context`` (1), ``grammar_rules`` (1),
            ``word_space`` ('vectors'), ``clustering`` ('kmeans'),
            ``rules_generalization`` ('off') and ``verbose`` ('none').

    Returns:
        OrderedDict: log accumulated from the dicts returned by each stage.

    Raises:
        FileNotFoundError: when ``check_dir`` returns a false value for the
            ``parses/`` directory.
    """
    import os  #, collections
    import pandas as pd
    from shutil import copy2 as copy
    from collections import OrderedDict
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    from src.clustering.poc04 import clusters2dict
    from src.utl.write_files import list2file, save_link_grammar
    from src.utl.write_files import save_category_tree
    from src.utl.widgets import html_table, plot2d

    context = kwargs.get('context', 1)
    grammar_rules = kwargs.get('grammar_rules', 1)
    word_space = kwargs.get('word_space', 'vectors')
    clustering = kwargs.get('clustering', 'kmeans')
    verbose = kwargs.get('verbose', 'none')

    # Positional arguments are forwarded to the stages through kwargs
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar

    def categories_path(cat_list):
        """Return the categories file path, auto-named for a directory."""
        path = output_categories
        if '.' not in path:
            if path and not path.endswith('/'):
                path += '/'
            path += (str(len(set([x[0] for x in cat_list]))) +
                     '_categories.txt')
        return path

    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)

    # Keep a copy of the input parses next to the output categories
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if not check_dir(parse_dir, True, verbose):
        raise FileNotFoundError('File not found', input_parses)
    for file in files:
        copy(file, os.path.dirname(parse_dir))

    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)
    # The categories file itself is not written here (saving is disabled in
    # the original, see its TODO 80523); only its path is logged and used
    # below to derive the cat_tree file name.
    cats_file = categories_path(category_list)
    log.update({'categories_file': cats_file})

    if grammar_rules != context:
        # Re-read the links with the grammar_rules context. A copy of kwargs
        # is used so missing 'context'/'grammar_rules' keys (defaults) do not
        # raise KeyError and kwargs itself stays untouched.
        links, _ = files2links(**dict(kwargs, context=grammar_rules))

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)

    if kwargs.get('rules_generalization', 'off') not in ['', 'off']:
        # NOTE(review): generalise_rules is neither defined nor imported in
        # the visible source -- confirm it exists at module level.
        cats_list, re06 = generalise_rules(rule_list, **kwargs)
        log.update(re06)
        # Accept the generalized categories only if there are fewer of them
        if len(set([x[0] for x in cats_list])) < len(
                set([x[0] for x in category_list])):
            category_list = cats_list
        cats_file = categories_path(category_list)
        log.update({'categories_file': cats_file})
        word_clusters = clusters2dict(category_list)
        rule_list, re07 = grammar_learner(word_clusters, links, **kwargs)
        log.update(re07)
        if verbose == 'debug':
            print('\nrules_generalisation ⇒ category_list:', category_list)

    if verbose not in ['min', 'none']:
        # display() is a builtin only inside IPython/Jupyter sessions
        from IPython.display import display
        display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
            + category_list))

    # Save cat_tree.txt file: replace the "_categories.txt" tail; fall back
    # to the extension-less path when the name has no underscore at all.
    if '_' in cats_file:
        stem = cats_file[:cats_file.rindex('_')]
    else:
        stem = os.path.splitext(cats_file)[0]
    tree_file = stem + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file, verbose)
    log.update(re08)

    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)
    return log
def category_learner(links, **kwargs):
    """Cluster words into categories and return them as a list.

    Args:
        links: links data passed to the vector-space and/or grouping helpers.
        **kwargs: optional settings (defaults in parentheses) read here:
            output_categories ('/output') -- only used to derive ``tmpath``;
            tmpath ('') -- '' or 'auto' derives ``<output dir>/tmp``;
            context (1), word_space ('vectors'), dim_max (100), sv_min (0.1),
            clustering ('kmeans'), cluster_range ((2, 48, 1)),
            cluster_criteria ('silhouette'), cluster_level (0.9),
            categories_generalization ('off'), verbose ('none').
            The whole kwargs dict is forwarded to
            ``aggregate_word_categories`` when generalization is enabled.

    Returns:
        tuple: ``(category_list, log)`` -- the result of ``clusters2list`` and
        an OrderedDict with the stats collected along the way.

    Raises:
        ValueError: for a ``clustering`` value that is neither 'kmeans' nor
            starts with 'group'/'ident', or for 'kmeans' without
            ``word_space == 'vectors'`` (no vectors to cluster).
    """
    import os
    from collections import OrderedDict
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc04 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar

    cats_file = kwargs.get('output_categories', '/output')
    tmpath = kwargs.get('tmpath', '')
    context = kwargs.get('context', 1)
    word_space = kwargs.get('word_space', 'vectors')
    dim_max = kwargs.get('dim_max', 100)
    sv_min = kwargs.get('sv_min', 0.1)
    clustering = kwargs.get('clustering', 'kmeans')
    cluster_range = kwargs.get('cluster_range', (2, 48, 1))
    cluster_criteria = kwargs.get('cluster_criteria', 'silhouette')
    cluster_level = kwargs.get('cluster_level', 0.9)
    generalization = kwargs.get('categories_generalization', 'off')
    verbose = kwargs.get('verbose', 'none')

    log = OrderedDict()
    log.update({'category_learner': '80525'})

    if tmpath == '' or tmpath == 'auto':
        # Temporary files go to <dir of output_categories>/tmp. dirname()
        # copes with a bare file name, where slicing at the last '/' raised.
        if '.' not in cats_file:
            tmpath = cats_file
        else:
            tmpath = os.path.dirname(cats_file)
        if tmpath and not tmpath.endswith('/'):
            tmpath += '/'
        tmpath += 'tmp'

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if word_space == 'vectors':
        # tmpath is passed twice: it is also the dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min,
                               verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min':
            print('Optimal vector space dimensionality:', dim)
        vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    if clustering == 'kmeans':
        if word_space != 'vectors':
            # vdf only exists for the vector word space
            raise ValueError(
                "clustering 'kmeans' requires word_space 'vectors', got " +
                repr(word_space))
        n_clusters = number_of_clusters(vdf, cluster_range ,clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc04/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']:
            print(clusters.applymap(round2))
            # 80412 hack: plots for AGI-2018
            if context == 1:
                plot2d(1, 2, clusters, 'cluster_words', 10)
            elif len(clusters) < 6:
                plot2d(1, 3, clusters, 'cluster_words', 10)
            else:
                plot2d(1, 4, clusters, 'cluster_words', 10)
    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']:
            print('clustering:', clustering)
        # NOTE(review): group_links is not imported in the visible source;
        # the original TODO points at src.clustering.grouping -- confirm.
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])
    else:
        raise ValueError('unsupported clustering: ' + repr(clustering))

    # Generalization = word categories aggregation
    if generalization in ['auto', 'jaccard', 'cosine']:
        # NOTE(review): aggregate_word_categories is not imported in the
        # visible source -- confirm it is available at module level.
        categories, re02 = aggregate_word_categories(clusters, **kwargs)
        log.update(re02)
    else:
        categories = clusters

    # The categories file is not written here; the caller decides (80522).
    category_list = clusters2list(categories)
    if verbose not in ['min', 'none']:
        # display() is a builtin only inside IPython/Jupyter sessions
        from IPython.display import display
        display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
            + category_list))
    if verbose in ['debug']:
        print('\nWord categories:\n', categories)
    if verbose not in ['min', 'none']:
        print('\nCategory list -', len(categories), 'lines, saved to',
              cats_file)
    return category_list, log
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    """Run the grammar-learning pipeline and save its results.

    Stages, in order: ``check_mst_files`` -> ``files2links`` ->
    ``category_learner`` -> optional ``generalize_categories`` ->
    ``add_disjuncts`` (local helper) -> ``induce_grammar`` -> optional
    ``generalize_rules`` -> ``save_cat_tree`` and ``save_link_grammar``.

    Args:
        input_parses: location of the input parses (original note: dir with
            .txt files); handed to ``check_mst_files``.
        output_categories: path/file.ext or directory; its directory is used
            as the project directory (corpus_stats.txt is written there).
        output_grammar: path/file.ext or directory, passed to
            ``save_link_grammar``.
        **kwargs: optional settings forwarded to the pipeline stages. Read
            directly here: ``context`` (1), ``grammar_rules`` (1),
            ``word_space`` ('vectors'), ``clustering`` ('kmeans'),
            ``categories_generalization`` ('off'),
            ``rules_generalization`` ('off') and ``verbose`` ('none').

    Returns:
        OrderedDict: log accumulated from the dicts returned by the stages,
        with 'start' and 'finish' timestamps.
    """
    import os, pickle  #, collections
    from collections import OrderedDict
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc05 import files2links
    from src.clustering.poc05 import clusters2dict
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table, plot2d
    from src.grammar_learner.generalization import generalize_categories, \
        reorder, cats2list, generalize_rules

    context = kwargs.get('context', 1)
    word_space = kwargs.get('word_space', 'vectors')
    clustering = kwargs.get('clustering', 'kmeans')
    cats_gen = kwargs.get('categories_generalization', 'off')
    grammar_rules = kwargs.get('grammar_rules', 1)
    rules_gen = kwargs.get('rules_generalization', 'off')
    verbose = kwargs.get('verbose', 'none')
    chatty = verbose in ['max', 'debug']
    print('learn_grammar: grammar_rules:', grammar_rules)

    # Positional arguments are forwarded to the stages through kwargs
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar

    table_header = [['Code','Parent','Id','Quality','Words', 'Disjuncts',
                     'djs','Relevance','Children']]

    def show_rules(cats):
        """Display a categories/rules dict as an HTML table (debug only)."""
        # display() is a builtin only inside IPython/Jupyter sessions
        from IPython.display import display
        display(html_table(table_header + list(cats2list(cats))))

    def add_disjuncts(cats, links, verbose='none'):
        """Return a deep copy of ``cats`` with per-cluster link data added.

        Adds 'disjuncts' (links of all words of a cluster), 'counts' and
        'djs' (sets of indices into the sorted list of distinct disjuncts).
        """
        from copy import deepcopy
        chatty = verbose in ['max', 'debug']
        if chatty:
            print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']),
                  'clusters')
        top_clusters = [i for i,x in enumerate(cats['cluster']) \
                        if i > 0 and x is not None]
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if chatty:
            print(UTC(), ':: add_disjuncts: word_clusters:',
                  len(word_clusters), 'words')

        df = links.copy()
        if chatty:
            print(UTC(),
                  ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        # one-element lists, so that the 'sum' aggregation concatenates them
        df['links'] = [[x] for x in df['link']]
        if chatty:
            print(
                UTC(),
                ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])'
            )
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if chatty:
            print(UTC(),
                  ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        cdf = df.groupby('cluster').agg({
            'links': 'sum',
            'count': 'sum'
        }).reset_index()

        # Place every aggregate at its cluster index so the new lists stay
        # aligned with the other lists in cats even when some index has no
        # links (index 0, sub-clusters, words absent from links).
        size = len(cats['cluster'])
        disjuncts = [[] for _ in range(size)]
        counts = [0] * size
        for cluster, cluster_links, count in zip(cdf['cluster'].tolist(),
                                                 cdf['links'].tolist(),
                                                 cdf['count'].tolist()):
            disjuncts[cluster] = cluster_links
            counts[cluster] = count
        if chatty:
            print(UTC(),':: add_disjuncts: len(cluster, disjuncts):', \
                len(cats['cluster']), len(disjuncts), '\ncounts:', counts)

        fat_cats = deepcopy(cats)
        fat_cats['counts'] = counts
        fat_cats['disjuncts'] = disjuncts
        # 'djs': disjuncts encoded as positions in the sorted distinct list;
        # a dict lookup replaces the quadratic list.index() scan.
        distinct = set()
        for cluster_links in disjuncts:
            distinct.update(cluster_links)
        dj_index = {dj: n for n, dj in enumerate(sorted(distinct))}
        fat_cats['djs'] = [set(dj_index[x] for x in y) for y in disjuncts]
        if chatty:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats

    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})

    # files ⇒ links:
    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats are returned by files2links (80605)
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if chatty:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    # Learn categories
    categories, re03 = category_learner(links, **kwargs)
    log.update(re03)

    # Generalize categories
    if cats_gen == 'jaccard' or (cats_gen == 'auto'
                                 and clustering == 'group'):
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto'
                                  and clustering == 'kmeans'):
        # TODO (original): vector-similarity based generalization
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print(
                '#TODO: categories generalization based on cosine similarity')
    else:
        # NOTE(review): the default 'off' also lands here and is logged as
        # an "error" -- confirm whether that wording is intended.
        gen_cats = categories
        log.update({'generalization': 'error: cats_gen = ' + str(cats_gen)})
        if chatty:
            print(UTC(),':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')

    # Re-read the links with the grammar_rules context. A copy of kwargs is
    # used so missing 'context'/'grammar_rules' keys (defaults) do not raise
    # KeyError and kwargs itself stays untouched.
    if grammar_rules != context:
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, _ = files2links(**dict(kwargs, context=grammar_rules))

    # Add disjuncts to the categories after k-means clustering
    if word_space == 'vectors' or clustering == 'kmeans':
        if chatty:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
        fat_cats = add_disjuncts(gen_cats, links)
        if chatty:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
    else:
        # Fallback named in the original TODO; without it fat_cats was
        # unbound for any other word_space/clustering combination.
        fat_cats = gen_cats

    # Learn grammar
    # NOTE(review): re07 (induce_grammar stats) is not added to the log in
    # the original either -- confirm whether it should be.
    rules, re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        print('induce_grammar ⇒ rules:')
        show_rules(rules)

    # Generalize grammar rules
    gen_rules = rules
    if rules_gen not in ['', 'off']:
        gen_rules, re08 = generalize_rules(rules, **kwargs)
        log.update(re08)
        if verbose == 'debug':
            print('generalize_rules ⇒ gen_rules:')
            show_rules(gen_rules)

    # Save cat_tree.txt file
    re09 = save_cat_tree(gen_rules, output_categories, verbose='none')
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})
    return log
def induce_grammar(categories, links, verbose='none'):
    """Convert string disjuncts of every cluster into tuples of cluster ids.

    A disjunct string such as ``'a- & was+'`` is split on whitespace; each
    connector ``word+`` becomes the (positive) index of the cluster that
    contains ``word`` and ``word-`` the negated index. ``'&'`` separators
    are skipped; a connector without a trailing sign is reported with a
    'no sign?' message and skipped. Disjuncts that are not strings are
    ignored.

    Args:
        categories: dict of parallel lists; the keys read here are
            'cluster', 'words', 'disjuncts' and 'parent'. Index 0 and
            entries whose 'cluster' is None are not processed. The input
            is not modified.
        links: unused (kept for interface compatibility).
        verbose: 'debug' prints progress and displays HTML tables.

    Returns:
        tuple: ``(rules, stats)`` -- ``rules`` is a deep copy of
        ``categories`` whose 'disjuncts' entries for processed clusters are
        sets of int tuples; ``stats`` is
        ``{'learned_rules': <entries with parent == 0, index > 0>,
        'total_clusters': len(rules['cluster']) - 1}``.

    Raises:
        KeyError: if a connector names a word that belongs to no cluster.
    """
    import copy

    debug = verbose == 'debug'
    if debug:
        # Imported before the first debug table: binding these names only
        # at the end of the function made the first table raise
        # UnboundLocalError. They are needed for debug output only.
        from IPython.display import display
        from src.grammar_learner.generalization import cats2list
        from src.utl.widgets import html_table

        def show(limit):
            """Display the first ``limit`` rows of the current rules."""
            display(html_table([['Code','Parent','Id','Quality','Words',
                                 'Disjuncts', 'djs','Relevance','Children']] \
                + [x for i,x in enumerate(cats2list(rules)) if i < limit]))

    rules = copy.deepcopy(categories)
    clusters = [
        i for i, x in enumerate(rules['cluster']) if i > 0 and x is not None
    ]
    word_clusters = {}
    for i in clusters:
        for word in rules['words'][i]:
            word_clusters[word] = i

    if debug:
        print('induce_grammar: rules.keys():', rules.keys())
        print('induce_grammar: clusters:', clusters)
        print('induce_grammar: word_clusters:', word_clusters)
        print('induce_grammar: rules ~ categories:')
        show(4)

    for cluster in clusters:
        djs = []
        for rule in categories['disjuncts'][cluster]:
            if not isinstance(rule, str):
                continue
            # 'a- & was-' ⇒ (-9,-26)
            dj = []
            for connector in rule.split():
                if connector == '&':
                    continue
                sign = connector[-1]
                if sign == '+':
                    dj.append(word_clusters[connector[:-1]])
                elif sign == '-':
                    dj.append(-1 * word_clusters[connector[:-1]])
                else:
                    print('no sign?', dj)
            djs.append(tuple(dj))
            if debug:
                print('induce_gramma: cluster', cluster, '::', rule, '⇒',
                      tuple(dj))
        rules['disjuncts'][cluster] = set(djs)
        if debug:
            print('induce_grammar: rules["disjuncts"][' + str(cluster) + ']',
                  rules['disjuncts'][cluster])

    if debug:
        print('induce_grammar: updated disjuncts:')
        show(32)

    return rules, {'learned_rules': \
        len([x for i,x in enumerate(rules['parent']) if x==0 and i>0]), \
        'total_clusters': len(rules['cluster']) - 1}