def print_kwargs(**kwargs):
    from src.utl.utl import UTC
    print('poc04 learn_grammar kwargs:')
    for k, v in kwargs.items():
        print(('- ' + k + ': ')[:20], v)
    kwargs['printed'] = str(UTC())
    return kwargs
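# A minimal usage sketch (illustrative settings, not from the original code):
# print_kwargs echoes each keyword argument and stamps the dict with the call time.
#
#   settings = {'context': 2, 'grammar_rules': 2, 'verbose': 'max'}
#   settings = print_kwargs(**settings)
#   print(settings['printed'])  # UTC timestamp added by print_kwargs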
def test_turtle_generalize_both(self):
    base = module_path + '/tests/data/POC-Turtle/' + \
        'generalized_categories_and_rules/dict_6C_2018-07-06_0005.4.0.dict'
        #'generalized_categories_and_rules/poc-turtle_6C_2018-06-08_0004.4.0.dict'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(UTC())[:10] + '/'
    prj_dir = batch_dir + 'generalized_categories_and_rules/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'jaccard',
        'rules_generalization': 'jaccard',
        'tmpath': module_path + '/tmp/',
        'verbose': 'none'
    }
    response = learn_grammar(input_parses, outpath, outpath, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    if len(rule_list) == len(base_list):
        assert rule_list == base_list
    else:
        assert len(rule_list) == len(base_list)
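# For reference, the lines compared above are the word and rule lines of a
# Link Grammar .dict file; a minimal illustrative fragment (not the actual
# baseline content) looks like:
#
#   % C01
#   "tuna" "herring":
#   (C02C01-);
#
# The filter line[0:1] in ['"', '('] keeps the quoted word lines and the
# parenthesised rule expressions, skipping '%' comments and header lines.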
def save_cat_tree(cats, output_categories, verbose='none'):  #80706 0.5
    #80611 ~ cats2list without 'djs', children'...
    # cats: {'cluster':[], 'words':[], ...}  #80609
    from copy import deepcopy
    from src.utl.write_files import list2file
    from src.utl.utl import UTC
    tree_file = output_categories
    if '.' not in tree_file:  #auto file name
        if tree_file[-1] != '/':
            tree_file += '/'
        #-tree_file += (str(len(set([x[0] for x in cats_list]))) + '_cat_tree.txt')
        n_cats = len([x for i, x in enumerate(cats['parent']) if i > 0 and x < 1])
        tree_file += (str(n_cats) + '_cat_tree.txt')
    categories = []
    for i, cluster in enumerate(cats['cluster']):
        if i == 0:
            continue
        category = []
        if cats['cluster'][i] is not None:
            category.append(cats['cluster'][i])
        else:
            category.append('')
        category.append(cats['parent'][i])
        category.append(i)
        category.append(round(cats['quality'][i], 2))
        #!category.append(sorted(cats['words'][i]))
        #80704+06 tmp hack FIXME
        wordz = deepcopy(sorted(cats['words'][i]))
        #-80704 word@1, word@2 ⇒ word.a, word.b:
        #-wordz = [x.replace('@1','.a') for x in wordz]
        #-wordz = [x.replace('@2','.b') for x in wordz]
        #-wordz = [x.replace('@3','.c') for x in wordz]
        wordz = [x.replace('@', '.') for x in wordz]  #80706 WSD: word@1 ⇒ word.1
        category.append(wordz)  #80704+06 tmp hack FIXME
        #80704+06 end
        category.append(cats['similarities'][i])
        #-category.append(cats['children'][i])
        categories.append(category)
    string = list2file(categories, tree_file)
    if verbose in ['max', 'debug']:
        print(UTC(), ':: src/utl.writefiles.py save_cat_tree:', \
            len(cats['cluster']) - 1, 'categories')
    if verbose == 'debug':
        from src.utl.widgets import html_table
        from IPython.display import display
        display(html_table(
            [['Code', 'Parent', 'Id', 'Sim', 'Words', 'Similarities']] + categories))
    return {'cat_tree_file': tree_file}
def add_disjuncts(cats, links, verbose='none'):
    from copy import deepcopy
    from src.utl.utl import UTC  # UTC() is used in the debug prints below
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']), 'clusters')
    top_clusters = [i for i, x in enumerate(cats['cluster']) \
                    if i > 0 and x is not None]
    word_clusters = dict()
    for i in top_clusters:
        for word in cats['words'][i]:
            word_clusters[word] = i
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: word_clusters:', len(word_clusters), 'words')
    df = links.copy()
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
    df['links'] = [[x] for x in df['link']]
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])')
    df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
    cdf = df.groupby('cluster').agg({'links': 'sum', 'count': 'sum'}).reset_index()
    #TODO? del df[...] to free RAM?
    disjuncts = [[]] + cdf['links'].tolist()
    counts = [0] + cdf['count'].tolist()
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: len(cluster, disjuncts):', \
            len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
    fat_cats = deepcopy(cats)
    fat_cats['counts'] = [0] + cdf['count'].tolist()
    fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()  #['djs']
    djset = set()
    [[djset.add(y) for y in x] for x in fat_cats['disjuncts']]
    djlist = sorted(djset)
    fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                       for y in fat_cats['disjuncts']]
    if verbose in ['max', 'debug']:
        print(UTC(), ':: add_disjuncts: return fat_cats')
    return fat_cats
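# A minimal sketch (illustrative data, not from the original corpus) of the
# 'djs' construction above: every distinct disjunct string gets an index in a
# sorted global list, and each cluster keeps the set of indices it uses.
#
#   disjuncts = [[], ['eats- & fish+', 'eats-'], ['eats-']]
#   djset = set()
#   [[djset.add(y) for y in x] for x in disjuncts]
#   djlist = sorted(djset)                               # ['eats-', 'eats- & fish+']
#   djs = [set(djlist.index(x) for x in y) for y in disjuncts]
#   # djs == [set(), {0, 1}, {0}]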
def test_turtle_diled(self):
    corpus = 'POC-Turtle'
    dataset = 'MST_fixed_manually'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    base = module_path + '/tests/data/POC-Turtle/' + \
        '/2018-04-25/turtle_dILEd_LW+dot+_2018-04-25_0008.4.0.dict'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(UTC())[:10] + '/'
    prj_dir = batch_dir + 'Turtle_dILEd_LW_and_period/'
    if check_dir(prj_dir, create=True, verbose='max'):
        output_categories = prj_dir
        output_grammar = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'verbose': 'min'
    }
    response = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    if len(rule_list) == len(base_list):
        if kwargs['verbose'] == 'debug':
            print('\nTest results vs baseline:')
            for i, rule in enumerate(base_list):
                print(rule_list[i])
                print(rule)
        assert rule_list == base_list
    else:
        assert len(rule_list) == len(base_list)
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar - path/file.ext / dir ⇒ auto file name
    def kwa(v, k): return kwargs[k] if k in kwargs else v
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level = kwa(0.9, 'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge = kwa(0.8, 'categories_merge')
    #-cats_aggr = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off', 'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge = kwa(0.8, 'rules_merge'),  # merge rules with similarity > this 'merge' criteria
    #-rules_aggr = kwa(0.3, 'rules_aggregation'),  # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    #80509 kwargs tests ~OK
    #-print('poc04 learn_grammar kwargs:')
    #-for k,v in kwargs.items(): print(('- '+k+': ')[:20], v)
    #-response = print_kwargs(**kwargs)
    #-return response

    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os  #, collections
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    #+from src.link_grammar.poc04 import category_learner
    from src.clustering.poc04 import clusters2dict
    #+from src.link_grammar.poc04 import grammar_learner
    #-from src.link_grammar.poc import save_link_grammar
    from src.utl.write_files import list2file, save_link_grammar
    from src.utl.widgets import html_table, plot2d
    from IPython.display import display  # display() assumes an IPython/Jupyter environment
    from collections import OrderedDict

    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    #log.update({'datime': str(UTC()), 'learn_grammar': '80510'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    #for file in files: copy(file, output_categories)
    #TODO: output_categories file ⇒ dir
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if check_dir(parse_dir, True, verbose):
        for file in files:
            copy(file, os.path.dirname(parse_dir))
    else:
        raise FileNotFoundError('File not found', input_parses)
    # group = True  #? always? False option for context = 0 (words)?

    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:', clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)
    # Save 1st cats_file = to control 2-step generalization  #FIXME:DEL
    cats_file = output_categories
    if '.' not in cats_file:  #80508 auto file name
        if cats_file[-1] != '/':
            cats_file += '/'
        cats_file += (str(len(set([x[0] for x in category_list]))) + '_categories.txt')
    #TODO: comment saving cats_file and run tests 80523
    #+categories = list2file(category_list, cats_file)
    log.update({'categories_file': cats_file})
    #...TODO... hierarchical categories 80523 snooze
    #...display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
    #...    + category_list))

    if grammar_rules != context:
        #-links, res4 = files2links(files, parse_mode, grammar_rules, group, left_wall, period, verbose)
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        links, re04 = files2links(**kwargs)
        kwargs['context'] = context

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)
    #...display(html_table([['Rules','','','','','']] + rule_list))

    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            cats_list, re06 = generalise_rules(rule_list, **kwargs)
            #TODO: = generalise_rules(rule_list, **kwargs)
            log.update(re06)
            if len(set([x[0] for x in cats_list])) < len(set([x[0] for x in category_list])):
                category_list = cats_list
                # Save 2nd cats_file - overwrite in case of equal
                cats_file = output_categories
                if '.' not in cats_file:  #80508 auto file name
                    if cats_file[-1] != '/':
                        cats_file += '/'
                    cats_file += (str(len(set([x[0] for x in category_list]))) + '_categories.txt')
                #TODO: comment saving cats_file and run tests 80523
                #+categories = list2file(category_list, cats_file)
                log.update({'categories_file': cats_file})
                word_clusters = clusters2dict(category_list)
                rule_list, re07 = grammar_learner(word_clusters, links, **kwargs)
                #...display(html_table([['Rules','','','','','']] + rule_list))
                log.update(re07)
            if verbose == 'debug':
                print('\nrules_generalisation ⇒ category_list:', category_list)

    if verbose not in ['min', 'none']:
        display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
            + category_list))

    # Save cat_tree.txt file
    from src.utl.write_files import save_category_tree
    tree_file = cats_file[:cats_file.rindex('_')] + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file, verbose)  #FIXME: verbose?
    log.update(re08)
    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)
    return log
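# A minimal usage sketch (paths and settings are illustrative assumptions, not
# taken from the original source): learn_grammar reads MST parses from a
# directory, writes category and .dict files, and returns an ordered log.
#
#   kwargs = {
#       'left_wall': 'LEFT-WALL', 'period': True,
#       'context': 2, 'word_space': 'discrete',
#       'dim_reduction': 'none', 'clustering': 'group',
#       'grammar_rules': 2, 'verbose': 'min',
#       'tmpath': module_path + '/tmp/',
#   }
#   log = learn_grammar(module_path + '/data/POC-Turtle/MST_fixed_manually/',
#                       module_path + '/output/', module_path + '/output/',
#                       **kwargs)
#   for k, v in log.items():
#       print(k, ':', v)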
def setUp(self):  #FIXME: should run before every test, but would not?!
    #-import os, sys
    # Paths  #FIXME: don't run?
    #module_path = os.path.abspath(os.path.join('..'))
    #if module_path not in sys.path:
    #    sys.path.append(module_path)
    # Imports - moved up:  #FIXME: don't run here?
    #-from src.grammar_learner.poc05 import learn_grammar
    #-from src.utl.utl import UTC
    src_path = module_path + '/src'
    if os.path.exists(src_path) and src_path not in sys.path:
        sys.path.append(src_path)
    # Don't need link grammar paths with new (June 2018) Grammar Tester (?)
    #-lg_path = '/home/oleg/miniconda3/envs/ull4/lib/python3.6/site-packages/linkgrammar'
    #-if os.path.exists(lg_path) and lg_path not in sys.path:
    #-    sys.path.append(lg_path)
    #-link_grammar_path = module_path + '/src/link_grammar'
    #-if os.path.exists(link_grammar_path) and link_grammar_path not in sys.path:
    #-    sys.path.append(link_grammar_path)
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(UTC())[:10] + '/'
    # Grammar Learner 0.5 parameters:
    # input_parses, output_categories, output_grammar, **kwargs
    kwargs = {  # defaults  #FIXME: don't pass to tests (should?)
        'parse_mode': 'given',        # 'given' (default) / 'explosive' (next)
        'left_wall': 'LEFT-WALL',     # '','none' - don't use / 'LEFT-WALL' - replace ###LEFT-WALL###
        'period': True,               # use period in links learning: True/False
        'context': 2,                 # 1: connectors / 2,3...: disjuncts
        'window': 'mst',              # 'mst' / reserved options for «explosive» parsing
        'weighting': 'ppmi',          # 'ppmi' / future options
        'group': True,                # group items after link parsing
        'distance': False,            # reserved options for «explosive» parsing
        'word_space': 'discrete',     # 'vectors' / 'discrete' - no dimensionality reduction
        'dim_max': 100,               # max vector space dimensionality
        'sv_min': 0.1,                # minimal singular value (fraction of the max value)
        'dim_reduction': 'none',      # 'svm' / 'none' (discrete word_space, group)
        'clustering': 'group',        # 'kmeans' / 'group'~'identical_entries' / future options
        'cluster_range': (2, 48, 1),  # min, max, step
        'cluster_criteria': 'silhouette',  # optimal clustering criteria
        'cluster_level': 0.9,         # level = 0, 1, 0.-0.99..: 0 - max number of clusters
        'categories_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
        'categories_merge': 0.8,      # merge categories with similarity > this 'merge' criteria
        'categories_aggregation': 0.2,  # aggregate categories with similarity > this criteria
        'grammar_rules': 2,           # 1: 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
        'rules_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
        'rules_merge': 0.8,           # merge rules with similarity > this 'merge' criteria
        'rules_aggregation': 0.2,     # aggregate rules similarity > this criteria
        'tmpath': module_path + '/tmp/',
        'verbose': 'min',             # display intermediate results: 'none', 'min', 'mid', 'max'
        # Additional (optional) parameters for parse_metrics (_ability & _quality):
        'test_corpus': module_path + '/data/POC-Turtle/poc-turtle-corpus.txt',
        'reference_path': module_path + '/data/POC-Turtle/poc-turtle-parses-expected.txt',
        'template_path': 'poc-turtle',  #FIXME: changed in June 2018 Grammar Tester
        'linkage_limit': 1
    }
    pass
def category_learner(links, \
        cat_path, dict_path, tmpath = '', verbose = 'none', \
        parse_mode = 'given', left_wall = '', period = False, \
        context = 1, window = 'mst', weighting = 'ppmi', group = True, \
        word_space = 'vectors', dim_max = 100, sv_min = 0.1, dim_reduction = 'svm', \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9):  #, \
        #-generalization = 'off', merge = 0.8, aggregate = 0.2, grammar_rules = 1):
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc03 import number_of_clusters, clusters2list  #80422
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    from IPython.display import display  # display() assumes an IPython/Jupyter environment

    log = {'project': 'Grammar Learner v.0.3 2018-04-11', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''
    if tmpath == '':
        tmpath = dict_path  # temporary files path

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:', clustering)

    if word_space == 'vectors':
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min':
            print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        vdf, sv, res3 = pmisvd(links, tmpath, tmpath, dim)
        log.update(res3)
    elif verbose in ['max', 'debug']:
        print('category_learner: word_space:', word_space, '/ clustering:', clustering)

    if clustering == 'kmeans':
        #^from src.clustering.kmeans import cluster_words_kmeans
        #^from src.clustering.poc03 import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('/poc03/category_learner: number of clusters =', n_clusters)
        if verbose in ['max', 'debug']:
            print(clusters.applymap(round2))
        if verbose in ['max', 'debug']:  #80412 hack: plots for AGI-2018 :(
            if context == 1:  #FIXME:DEL?
                plot2d(1, 2, clusters, 'cluster_words', 10)
            else:
                if len(clusters) < 6:
                    plot2d(1, 3, clusters, 'cluster_words', 10)
                else:
                    plot2d(1, 4, clusters, 'cluster_words', 10)
    elif clustering[:5] in ['group', 'ident']:
        if verbose in ['max', 'debug']:
            print('clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose in ['max', 'debug']:
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week
    # Save categories
    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''
    if cat_path[-1] != '/':
        cat_path += '/'
    cat_file = cat_path + str(len(clusters)) + '_categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''
    if verbose in ['debug']:
        print('\nWord categories:\n')
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose not in ['min', 'none']:
        print('\nCategory list -', len(categories.splitlines()), 'lines, saved to', cat_file)
    return category_list, log
def save_link_grammar(rules, output_grammar, grammar_rules=2, header='', footer=''):  #80626
    # rules: [] or {} -
    # grammar_rules = kwargs['grammar_rules']: 1 ⇒ connectors, 2+ ⇒ disjuncts
    import os
    from ..utl.utl import UTC
    #-if path[-1] != '/': path += '/'
    if type(rules) is dict:  #80620 0.5 new data structure, 80626 connector-based rules
        rules = rules2list(rules, grammar_rules)
    link_grammar = ''  #80510 0.4
    line_list = list()
    clusters = set()
    for rule in rules:
        line = ''
        if len(rule[2]) > 0 and len(rule[3]) > 0:
            line += '{' + ' or '.join(str(x) for x in rule[2]) \
                + '} & {' + ' or '.join(str(y) for y in rule[3]) + '}'
        else:
            if len(rule[2]) > 0:
                line += ' or '.join('(' + str(x) + ')' for x in rule[2])
            elif len(rule[3]) > 0:
                line += ' or '.join('(' + str(x) + ')' for x in rule[3])
        if len(rule[4]) > 0:
            if line != '':
                line += ' or '
            line += ' or '.join('(' + str(x) + ')' for x in rule[4])
        cluster_number = '% ' + str(rule[0]) + '\n'  # comment line: cluster
        cluster_and_words = ' '.join('"' + word + '"' for word in rule[1]) + ':\n'
        line_list.append(cluster_number + cluster_and_words + line + ';\n')
        clusters.add(rule[0])
    line_list.sort()  #FIXME: overkill?
    #TODO: file naming - corpus name?
    #-if file != '': out_file = path + file
    if os.path.isfile(output_grammar):
        out_file = output_grammar
    elif os.path.isdir(output_grammar):
        out_file = output_grammar
        if out_file[-1] != '/':
            out_file += '/'
        #-if 'isa' in '\t'.join(line_list): out_file += 'poc-turtle_'
        #-else: out_file += 'poc-english_'
        #out_file += 'poc-english_'  #80704 replaced with:
        out_file += 'dict_'
        out_file = out_file + str(len(clusters)) + 'C_' \
            + str(UTC())[:10] + '_0005.4.0.dict'  #80620 0004⇒0005
    else:
        raise FileNotFoundError('File not found', output_grammar)

    if header == '':
        header = '% Grammar Learner v.0.5 ' + str(UTC())  #80620 .4⇒.5
    header = header + '\n' + '<dictionary-version-number>: V0v0v5+;\n' \
        + '<dictionary-locale>: EN4us+;'
    add_rules = 'UNKNOWN-WORD: XXX+;'
    if footer == '':
        footer = '% ' + str(len(clusters)) + ' word clusters, ' \
            + str(len(rules)) + ' Link Grammar rules.\n' \
            + '% Link Grammar file saved to: ' + out_file
    lg = header + '\n\n' + '\n'.join(line_list) + '\n' + add_rules + '\n\n' + footer
    #-80704 tmp FIXME:
    #-lg = lg.replace('@1', '.a')
    #-lg = lg.replace('@2', '.b')
    #-lg = lg.replace('@3', '.c')
    lg = lg.replace('@', '.')  #80706 WSD: word@1 ⇒ word.1
    with open(out_file, 'w') as f:
        f.write(lg)

    from collections import OrderedDict
    response = OrderedDict({'grammar_file': out_file})
    response.update({'grammar_clusters': len(clusters), 'grammar_rules': len(rules)})
    return response
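# A minimal sketch (illustrative rule data and an assumed output directory) of
# the list-form input expected above; each rule is
# [cluster_label, words, left_connectors, right_connectors, disjuncts].
# Assumes save_link_grammar is imported from its package (the relative import
# of UTC requires package context).
#
#   rules = [
#       ['C01', ['tuna', 'herring'], [], [], ['C02C01-']],
#       ['C02', ['eats'], [], [], ['C03C02- & C01C02+']],
#       ['C03', ['turtle', 'parrot'], [], [], ['C03C02+']],
#   ]
#   response = save_link_grammar(rules, '/tmp/')  # '/tmp/' - assumed existing dir
#   print(response['grammar_file'], response['grammar_clusters'], response['grammar_rules'])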
def learn_connectors(input_dir, cat_path, dict_path, verbose='none', \
        parse_mode='given', \
        word_space = 'hyperwords', dim_max = 100, sv_min = 0.1, \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = ''):
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.utl.read_files import check_mst_files
    from src.space.poc import files2links
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    from IPython.display import display  # display() assumes an IPython/Jupyter environment

    log = {'project': 'Unified Grammar Learner: Clustering words', \
           'date': str(UTC()), 'project_dir': dict_path, 'input_dir': input_dir}
    """TODO: dict ⇒ list [[]] / OrderedDict?"""
    files, response = check_mst_files(input_dir, verbose='none')
    links = files2links(files, parse_mode='given', context=1, group = True, \
        left_wall='LEFT-WALL', period=True, verbose='none')
    # vector_space_dim(links, path, tmpath, dim_max=100, sv_min=0.9, 'max')
    if tmpath == '':
        tmpath = dict_path
    dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
    log.update({'vector_space_dim': dim})
    if verbose not in ['none', 'min']:
        print('Optimal vector space dimensionality:', dim)
    vdf, sv, res2 = pmisvd(links, dict_path, tmpath, dim)
    log.update(res2)
    #-vdf.applymap(round2).sort_values(by=[1,2,3], ascending=[False,False,False])
    n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
        criteria=cluster_criteria, level=cluster_level, verbose=verbose)
    if verbose not in ['none', 'min']:
        print('Optimal number of clusters:', n_clusters)
    clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
    if verbose in ['max', 'debug']:
        plot2d(1, 2, clusters, 'cluster_words', 10)

    # Generalisation - just histogram? - Grammar-Learner-Clustering-Words 2.6
    import numpy as np
    from src.clustering.similarity import cluster_similarity
    sim_df, res3 = cluster_similarity(clusters, 'max')
    log.update(res3)
    if verbose in ['max', 'debug']:
        count, division = np.histogram(sim_df['similarity'])
        sim_df['similarity'].hist(bins=division)
        print('Cluster similarities: absolute values > 0.1:')
        sim_df.sort_values(by='similarity', ascending=False).loc[(sim_df['similarity']) > 0.1]

    # Save categories
    category_list = clusters2list(clusters)
    if cat_path[-1] != '/':
        cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose in ['max', 'debug']:
        for line in categories.splitlines():
            print(line)
        print('<...>\nTotal', len(categories.splitlines()), \
            'lines, saved to', cat_file)
        #-print(len(categories.splitlines()), 'categories saved to', cat_file)

    # Grammar Learner
    lg_rule_list = grammar_learner(clusters, links, verbose)
    if verbose == 'max':
        display(html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] + lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict
    #TODO: return paths to categories and dict?
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
def category_learner(links, \
        cat_path, dict_path, verbose='none', \
        parse_mode='given', \
        word_space = 'vectors', dim_max = 100, sv_min = 0.1, \
        dim_reduction = 'svm', \
        clustering = 'kmeans', cluster_range = (2,48,1), \
        cluster_criteria = 'silhouette', cluster_level = 0.9, tmpath = '',
        generalization = 'off', grammar_rules = 'connectors'):
    # no actual need for grammar rules here?
    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc import number_of_clusters, clusters2list
    from src.utl.turtle import html_table, plot2d
    from src.utl.write_files import list2file
    from src.link_grammar.poc import save_link_grammar
    from IPython.display import display  # display() assumes an IPython/Jupyter environment

    log = {'project': 'Grammar Learner v.0.2 2018-04-06', \
           'date': str(UTC()), 'project_dir': dict_path}
    '''TODO: log: dict ⇒ list [[]]? / OrderedDict?'''
    if word_space == 'vectors':
        if tmpath == '':
            tmpath = dict_path
        #^from src.space.hyperwords import vector_space_dim, pmisvd
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min':
            print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)
    else:  #TODO: word_space = 'discrete'...
        if tmpath == '':
            tmpath = dict_path
        dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose == 'min':
            print('Optimal vector space dimensionality:', dim)
        vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        log.update(res3)

    # Clustering
    #-clustering = 'group'
    if clustering == 'kmeans':
        #^from src.clustering.poc import number_of_clusters, clusters2list
        n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        if verbose not in ['min', 'none']:
            print('Optimal number of clusters:', n_clusters)
        if verbose == 'max':
            plot2d(1, 2, clusters, 'cluster_words', 10)
    elif clustering[:5] in ['group', 'ident']:  #80606 test ~OK

        def group_links(links):  #+TODO: old code ⇒ here ⇒ src.clustering.group_links.py
            #-Old way:
            from src.link_grammar.turtle import lexical_entries, entries2clusters
            djs = links.rename(columns={'link': 'disjunct'})
            #-clusters = entries2clusters(lexical_entries(djs))
            entries = lexical_entries(djs)
            clusters = entries2clusters(entries).rename(columns={'germs': 'cluster_words'})
            return clusters

        #+from ... import group_links
        clusters = group_links(links)
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))
        if verbose == 'max':
            print('\n', clusters[['cluster_words', 'disjuncts']])  #.head(12))

    # Generalization  #TODO next week
    # Save categories
    #^from src.clustering.poc import clusters2list
    #^from src.utl.write_files import list2file
    category_list = clusters2list(clusters)
    if verbose not in ['min', 'none']:
        display(html_table([['Parent','Category','Quality','Words','Relevance']] \
            + category_list))
    '''TODO: categories file name'''
    if cat_path[-1] != '/':
        cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    '''TODO: category file path ⇒ log'''
    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
    if verbose != 'none':
        print('<...>\nTotal', len(categories.splitlines()), 'lines, saved to', cat_file)
    return category_list, log
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar - path/file.ext / dir ⇒ auto file name
    def kwa(v, k): return kwargs[k] if k in kwargs else v
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level = kwa(0.9, 'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge = kwa(0.8, 'categories_merge')
    #-cats_aggr = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off', 'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge = kwa(0.8, 'rules_merge'),  # merge rules with similarity > this 'merge' criteria
    #-rules_aggr = kwa(0.3, 'rules_aggregation'),  # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')
    print('learn_grammar: grammar_rules:', grammar_rules)

    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os, pickle  #, collections
    from collections import OrderedDict
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc05 import files2links  #80528 .poc05
    from src.clustering.poc05 import clusters2dict
    #+from src.link_grammar.poc05 import category_learner
    #+from src.link_grammar.poc05 import induce_grammar
    from src.utl.write_files import list2file, save_link_grammar, save_cat_tree
    from src.utl.widgets import html_table, plot2d
    from src.grammar_learner.generalization import generalize_categories, \
        reorder, cats2list, generalize_rules  #, aggregate, aggregate_word_categories
    from IPython.display import display  # display() assumes an IPython/Jupyter environment

    log = OrderedDict({'start': str(UTC()), 'learn_grammar': '80605'})
    #TODO: save kwargs?
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    if os.path.isdir(output_categories):
        prj_dir = output_categories
    else:
        prj_dir = os.path.dirname(output_categories)
    log.update({'project_directory': prj_dir})
    #-Save a copy of input parses to prj_dir + '/parses/'  #FIXME:DEL? #80704
    #-parse_dir = prj_dir + '/parses/'
    #-if check_dir(parse_dir, True, verbose):
    #-    for file in files: copy(file, os.path.dirname(parse_dir))
    #-else: raise FileNotFoundError('File not found', input_parses)
    # group = True  #? always? False option for context = 0 (words)?

    kwargs['input_files'] = files
    # files ⇒ links:
    links, re02 = files2links(**kwargs)
    log.update(re02)
    # corpus_stats - implanted in files2links 80605
    list2file(re02['corpus_stats'], prj_dir + '/corpus_stats.txt')
    log.update({'corpus_stats_file': prj_dir + '/corpus_stats.txt'})
    if verbose in ['max', 'debug']:
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:', clustering)

    # Learn categories: new 80619
    categories, re03 = category_learner(links, **kwargs)  #v.0.5 categories: {}
    log.update(re03)

    # Generalize categories  #TODO? "gen_cats" ⇒ "categories"? no new name
    if cats_gen == 'jaccard' or (cats_gen == 'auto' and clustering == 'group'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ generalize_categories (jaccard)')
        gen_cats, re04 = generalize_categories(categories, **kwargs)
        log.update(re04)
    elif cats_gen == 'cosine' or (cats_gen == 'auto' and clustering == 'kmeans'):
        #TODO: vectors g12n
        gen_cats = categories
        log.update({'generalization': 'vector-similarity based - #TODO'})
        if verbose == 'debug':
            print('#TODO: categories generalization based on cosine similarity')
    else:
        gen_cats = categories
        log.update({'generalization': 'error: cats_gen = ' + str(cats_gen)})
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar: generalization: else: cats_gen =', \
                cats_gen, '⇒ gen_cats = categories')

    # Save 1st cats_file = to control 2-step generalization  #FIXME:DEL? #80704
    #-re05 = save_cat_tree(gen_cats, output_categories, verbose)
    #-log.update({'category_tree_file': re05['cat_tree_file']})
    # Save cats.pkl
    #-with open(re05['cat_tree_file'][:-3]+'pkl', 'wb') as f:  #FIXME:DEL? #80704
    #-    pickle.dump(gen_cats, f)
    #-if verbose in ['max','debug']:
    #-    print(UTC(),':: learn_grammar: 1st cat_tree saved')

    # Learn grammar  #80623
    if grammar_rules != context:
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ files2links(**kwargs)')
        links, re06 = files2links(**kwargs)
        kwargs['context'] = context

    # add disjuncts to categories {} after k-means clustering  #TODO: speed!
    def add_disjuncts(cats, links, verbose='none'):
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cats:', len(cats['cluster']), 'clusters')
        from copy import deepcopy
        top_clusters = [i for i, x in enumerate(cats['cluster']) \
                        if i > 0 and x is not None]
        word_clusters = dict()
        for i in top_clusters:
            for word in cats['words'][i]:
                word_clusters[word] = i
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: word_clusters:', len(word_clusters), 'words')
        df = links.copy()
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: df[links] = [[x] for x in df[link]]')
        df['links'] = [[x] for x in df['link']]
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: df[cluster] = df[word].apply(lambda x: word_clusters[x])')
        df['cluster'] = df['word'].apply(lambda x: word_clusters[x])
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: cdf = df.groupby("cluster").agg(...')
        cdf = df.groupby('cluster').agg({'links': 'sum', 'count': 'sum'}).reset_index()
        #TODO? del df[...] to free RAM?
        disjuncts = [[]] + cdf['links'].tolist()
        counts = [0] + cdf['count'].tolist()
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: len(cluster, disjuncts):', \
                len(cats['cluster']), len(disjuncts), '\ncounts:', counts)
        fat_cats = deepcopy(cats)
        fat_cats['counts'] = [0] + cdf['count'].tolist()
        fat_cats['disjuncts'] = [[]] + cdf['links'].tolist()  #['djs']
        djset = set()
        [[djset.add(y) for y in x] for x in fat_cats['disjuncts']]
        djlist = sorted(djset)
        fat_cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
                           for y in fat_cats['disjuncts']]
        if verbose in ['max', 'debug']:
            print(UTC(), ':: add_disjuncts: return fat_cats')
        return fat_cats

    #TODO: def djs? vectors(disjuncts, **kwargs)
    #if context < 2 and grammar_rules > 1:
    if word_space == 'vectors' or clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar ⇒ add_disjuncts')
        #with open(re05['cat_tree_file'][:-9]+'s.pkl', 'wb') as f:  #FIXME:DEL tmp 80601
        #    pickle.dump(gen_cats, f)
        fat_cats = add_disjuncts(gen_cats, links)
        if verbose in ['max', 'debug']:
            print(UTC(), ':: learn_grammar: back from add_disjuncts')
        #TODO: fat_cats['djs'] = djs(fat_cats[disjuncts], **kwargs)
        #TODO:
    else:
        fat_cats = gen_cats

    # Learn Grammar
    #+from src.grammar_learner.poc05 import induce_grammar
    rules, re07 = induce_grammar(fat_cats, links)
    if verbose == 'debug':
        print('induce_grammar ⇒ rules:')
        display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
            + [x for i, x in enumerate(cats2list(rules))]))

    # Generalize grammar rules
    gen_rules = rules
    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            from src.grammar_learner.generalization import generalize_rules
            gen_rules, re08 = generalize_rules(rules, **kwargs)
            log.update(re08)
    if verbose == 'debug':
        print('generalize_rules ⇒ gen_rules:')
        display(html_table([['Code','Parent','Id','Quality','Words', 'Disjuncts', 'djs','Relevance','Children']] \
            + [x for i, x in enumerate(cats2list(gen_rules))]))

    # Save cat_tree.txt file
    #^from src.utl.write_files import save_cat_tree
    re09 = save_cat_tree(gen_rules, output_categories, verbose='none')  #FIXME: verbose?
    #TODO: check file save error?
    log.update(re09)
    # Save Link Grammar .dict
    re10 = save_link_grammar(gen_rules, output_grammar, grammar_rules)
    log.update(re10)
    log.update({'finish': str(UTC())})
    #TODO: elapsed execution time? Save log?
    return log
def category_learner(links, **kwargs):  #80619 POC.0.5
    # links - DataFrame ['word', 'link', 'count']
    def kwa(v, k): return kwargs[k] if k in kwargs else v
    #-links = kwargs['links']  # links - check?
    cats_file = kwa('/output', 'output_categories')  # to define tmpath
    #-dict_path = kwa('/output', 'output_grammar')  # not used here
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    cluster_range = kwa((2, 48, 1), 'cluster_range')
    cluster_criteria = kwa('silhouette', 'cluster_criteria')
    cluster_level = kwa(0.9, 'cluster_level')
    generalization = kwa('off', 'categories_generalization')
    merge = kwa(0.8, 'categories_merge')
    aggregate = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    verbose = kwa('none', 'verbose')

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc05 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d
    from src.utl.read_files import check_dir  #, check_mst_files
    from src.utl.write_files import list2file, save_link_grammar
    #-from src.grammar_learner.poc05 import group_links, \
    #-    aggregate_cosine, aggregate_jaccard, aggregate_word_categories
    from collections import OrderedDict

    log = OrderedDict()
    log.update({'category_learner': '80619'})

    if tmpath == '' or tmpath == 'auto':  # temporary files path
        if '.' not in cats_file:
            tmpath = cats_file
        else:
            tmpath = cats_file[:cats_file.rindex('/')]
        if tmpath[-1] != '/':
            tmpath += '/'
        tmpath += 'tmp/'
        print('tmpath:', tmpath)
    if check_dir(tmpath, True, verbose):
        log.update({'tmpath': tmpath})
    #TODO:ERROR

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:', clustering)

    #-if word_space == 'vectors':  #80619 Category-Tree-2018-06-19.ipynb
    if context == 1 or word_space[0] in ['v', 'e'] or clustering == 'kmeans':
        # word_space options: v,e: 'vectors'='embeddings', d,w: 'discrete'='word_vectors'
        print('DRK: context =', str(context) + ', word_space: ' + word_space + ', clustering:', clustering)
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose in ['mid', 'max', 'debug']:
            print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
        #-if clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ number_of_clusters')
        n_clusters = number_of_clusters(vdf, cluster_range, clustering, \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        log.update({'n_clusters': n_clusters})
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ cluster_words_kmeans:', n_clusters, 'clusters')
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        log.update({'silhouette': silhouette, 'inertia': inertia})
    #-elif clustering[:5] in ['group','ident']:
    else:
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ iLE group_links: context =', \
                str(context) + ', word_space: ' + str(word_space) + ', clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        log.update({'n_clusters': len(clusters)})
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))

    # Convert clusters DataFrame ⇒ cats {}  #80619 0.5
    #TODO?: if clusters == pd.dataframe:
    if verbose in ['max', 'debug']:
        print(UTC(), ':: category_learner: convert clusters ⇒ cats {}')
    cats = {}  #80609 dict instead of DataFrame
    cats['cluster'] = ['C0'] + clusters['cluster'].tolist()
    cats['parent'] = [0 for x in cats['cluster']]
    cats['words'] = [[]] + [set(x) for x in clusters['cluster_words'].tolist()]
    if 'disjuncts' in clusters:
        cats['disjuncts'] = [[]] + clusters['disjuncts'].tolist()
        djset = set()
        [[djset.add(y) for y in x] for x in cats['disjuncts']]
        djlist = sorted(djset)
        cats['djs'] = [set([djlist.index(x) for x in y if x in djlist]) \
            for y in cats['disjuncts']]
    if 'counts' in clusters:
        cats['counts'] = [0] + clusters['counts'].tolist()
    if word_space == 'vectors' or clustering == 'kmeans':
        cats['quality'] = [0 for x in cats['words']]
        cats['similarities'] = [[0 for y in x] for x in cats['words']]
    else:
        cats['quality'] = [1 for x in cats['words']]
        cats['quality'][0] = 0
        cats['similarities'] = [[1 for y in x] for x in cats['words']]
        cats['similarities'][0] = [0]
    cats['children'] = [0 for x in cats['words']]

    return cats, log
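# A minimal sketch (illustrative values, not from a real run) of the cats {}
# structure returned above: parallel lists indexed by cluster id, with row 0
# reserved for the root cluster 'C0'.
#
#   cats = {
#       'cluster':      ['C0', 'C01', 'C02'],
#       'parent':       [0, 0, 0],
#       'words':        [[], {'tuna', 'herring'}, {'eats'}],
#       'disjuncts':    [[], ['C02C01-'], ['C01C02+']],
#       'djs':          [set(), {1}, {0}],
#       'quality':      [0, 1, 1],
#       'similarities': [[0], [1, 1], [1]],
#       'children':     [0, 0, 0],
#   }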