def main(argv):
    """ Usage: python tstr.py config.json

    Reads a JSON config, tests an existing .dict grammar with pqa_meter,
    and writes timing stats next to the output grammar.
    """
    print('\nGrammar Tester v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: tstr <json-config-file>''')
        sys.exit()
    # FIX: getopt returns (option, value) pairs; the original compared each
    # whole pair to '-h', so the help flag was never recognized.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: tstr <json-config-file>''')
            sys.exit()
    # FIX: guard against a missing positional argument (IndexError before).
    if len(args) < 1:
        print('''Usage: tstr <json-config-file>''')
        sys.exit()
    config_json = args[0]
    with open(config_json) as f:
        kwargs = json.load(f)
    re = {}  # NOTE: shadows nothing here, but an unfortunate name (re module)
    if 'input_grammar' in kwargs:  # Test .dict file  # 90123
        ig = module_path + kwargs['input_grammar']
        og = module_path + kwargs['out_path']    # og: output grammar
        rp = module_path + kwargs['reference']   # rp: reference path
        if 'test_corpus' in kwargs:
            cp = module_path + kwargs['test_corpus']  # cp: corpus path
        else:
            cp = rp  # test corpus path = reference parses path
        print('Input grammar:', ig, '\nOutput directory:', og)
        if check_dir(og, True, 'max'):
            print('Grammar test started', UTC(), '\n')
            start = time.time()
            a, f1, precision, q = pqa_meter(ig, og, cp, rp, **kwargs)
            re.update({'grammar_test_time': sec2string(time.time() - start)})
        else:
            # FIX: exit here — the original fell through to the final print,
            # which raised NameError on the undefined a, q, f1.
            print('Output path error:', og)
            sys.exit()
    else:
        print('Please set "input grammar" in config.json')
        sys.exit()
    stats = []
    if 'grammar_test_time' in re:
        stats.append(['Grammar test time ', re['grammar_test_time']])
    if len(stats) > 0:
        list2file(stats, og + '/test_stats.txt')
        copy(config_json, og)
    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(f'PA = {int(round(a*100,0))}%, PQ = {int(round(q*100,0))}%, '
          f'F1 = {round(f1,2)}')
def main(argv):
    """ Usage: python ppln.py config.json

    Reads a JSON config and runs the learn-and-test pipeline via wide_rows.
    """
    print('\nGrammar Learner + Tester ppln v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    # FIX: getopt returns (option, value) pairs; the original compared each
    # whole pair to '-h', so the help flag was never recognized.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: ppln <json-config-file>''')
            sys.exit()
    # FIX: guard against a missing positional argument (IndexError before).
    if len(args) < 1:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    config_json = args[0]
    with open(config_json) as f:
        kwargs = json.load(f)
    corpus = kwargs['corpus']
    del kwargs['corpus']
    dataset = kwargs['dataset']
    del kwargs['dataset']
    if 'input_parses' not in kwargs:
        # NOTE(review): this default is NOT prefixed with module_path —
        # confirm the downstream consumer resolves it; looks inconsistent.
        kwargs['input_parses'] = '/data/' + corpus + '/' + dataset
    line = [[0, corpus, dataset, 0, 0, kwargs['rules_generalization']]]
    out_path = module_path + kwargs['out_path']
    rp = module_path + kwargs['reference']  # reference parses path
    if 'test_corpus' in kwargs:
        cp = module_path + kwargs['test_corpus']
    else:
        cp = rp  # test corpus path = reference parses path
    # Resolve temporary directory: '' disables, absolute 'home' paths pass
    # through, everything else is rooted at module_path.
    if 'tmpath' not in kwargs or len(kwargs['tmpath']) == 0:
        kwargs['tmp_dir'] = ''
    else:
        if 'home' in kwargs['tmpath']:
            tmpath = kwargs['tmpath']
        else:
            tmpath = module_path + kwargs['tmpath']
        kwargs['tmp_dir'] = tmpath if check_dir(tmpath, True, 'none') else ''
    a, _, hdr, log, rules = wide_rows(line, out_path, cp, rp, (1, 1), **kwargs)
    copy(config_json, log['project_directory'])
    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(test_stats(log))
    print('Output directory:', log['project_directory'], '\n')
def test_pqa_english_noamb_diled_no_generalization(self):
    """POC-English-NoAmb, discrete ILE pipeline, generalization off:
    learned grammar must reach PA * recall > 0.99."""
    parses_dir = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/noamb_pqa_diled_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp, rp :: (test) corpus_path and reference_path:
    cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
    rp = parses_dir + '/poc-english_noAmb-parses-gold.txt'
    kwargs = {
        'input_parses': parses_dir,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min',
    }
    re = learn_grammar(**kwargs)
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath,
                                          cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, f'{pa} * {recall} > 0.99')
def test_pqa_turtle_ddrkd_no_generalization(self):
    """POC-Turtle, vector space + SVD + k-means, generalization off:
    learned grammar must reach PA * recall > 0.99."""
    parses_dir = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_pqa_ddrkd_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp, rp :: (test) corpus_path and reference_path:
    cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
    rp = parses_dir + '/poc-turtle-parses-gold.txt'
    kwargs = {
        'input_parses': parses_dir,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'vectors',
        'dim_reduction': 'svd',
        'clustering': ('kmeans', 'kmeans++', 18),
        'cluster_range': (20, 2, 9),
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min',
    }
    re = learn_grammar(**kwargs)
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath,
                                          cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, f'{pa} * {recall} > 0.99')
def test_pqa_turtle_diled_no_generalization(self):
    """POC-Turtle, discrete ILE pipeline, generalization off:
    learned grammar must reach PA * recall > 0.99."""
    parses_dir = module_path + '/tests/data/POC-Turtle/MST-fixed-manually'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_pqa_diled_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp, rp :: (test) corpus_path and reference_path:
    cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
    rp = parses_dir + '/poc-turtle-parses-gold.txt'
    kwargs = {
        'input_parses': parses_dir,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min',
    }
    re = learn_grammar(**kwargs)
    # pa, f1, precision, recall: <float> 0.0 - 1.0
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath,
                                          cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, f'{pa} * {recall} > 0.99')
def test_turtle_generalize_rules(self):
    """Learn a grammar with jaccard rule generalization on POC-Turtle and
    compare the emitted rule lines against the stored baseline .dict."""
    base = module_path + '/tests/data/POC-Turtle/' + \
        'generalized_rules/dict_6C_2018-10-03_0006.4.0.dict'
    input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_lw_&_dot_generalized_rules/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'jaccard',
        'tmpath': module_path + '/tmp/',
        'verbose': 'none',
    }
    response = learn_grammar(**kwargs)
    # Rule lines in a Link Grammar .dict start with '"' or '(';
    # everything else (comments, headers) is ignored.
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    # FIX: the original branched on len() equality and, in the unequal
    # case, asserted a condition already known to be False — failing with
    # no diagnostic. A single equality assertion covers both cases and
    # reports a readable diff on failure.
    self.assertEqual(rule_list, base_list)
def setUp(self):
    """Build a reference dict of default Grammar Learner parameters.

    NOTE(review): `kwargs` is a local variable and is discarded when this
    method returns (the body ends with `pass`), so setUp currently has no
    effect on the tests — each test builds its own kwargs. Presumably it
    was meant to be stored (e.g. `self.kwargs = kwargs`) or serves only as
    in-code documentation of the defaults — confirm intent.
    """
    # FIXME: should run before every test, but would not?!
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
        UTC())[:10] + '/'
    kwargs = {  # defaults
        'input_parses': input_parses,  # path to directory with input parses
        'output_grammar': batch_dir,  # filename or path
        'output_categories': '',  # = output_grammar if '' or not set
        'output_statistics': '',  # = output_grammar if '' or not set
        'temp_dir': '',  # temporary files = language-learning/tmp if '' or not set
        'parse_mode': 'given',  # 'given' (default) / 'explosive' (next)
        'left_wall': 'LEFT-WALL',  # '','none' - don't use / 'LEFT-WALL' - replace ###LEFT-WALL###
        'period': True,  # use period in links learning: True/False
        'context': 2,  # 1: connectors / 2,3...: disjuncts
        'window': 'mst',  # 'mst' / reserved options for «explosive» parsing
        'weighting': 'ppmi',  # 'ppmi' / future options
        'group': True,  # group items after link parsing
        'distance': False,  # reserved options for «explosive» parsing
        'word_space': 'discrete',  # 'vectors' / 'discrete' - no dimensionality reduction
        'dim_max': 100,  # max vector space dimensionality
        'sv_min': 0.1,  # minimal singular value (fraction of the max value)
        'dim_reduction': 'none',  # 'svm' / 'none' (discrete word_space, group)
        'clustering': 'group',  # 'kmeans' / 'group'~'identical_entries' / future options
        'cluster_range': (2, 48, 1),  # min, max, step
        'cluster_criteria': 'silhouette',  # optimal clustering criteria
        'cluster_level': 1.0,  # level = 0, 1, 0.-0.99..: 0 - max number of clusters
        'categories_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
        'categories_merge': 0.8,  # merge categories with similarity > this 'merge' criteria
        'categories_aggregation': 0.2,  # aggregate categories with similarity > this criteria
        'grammar_rules': 2,  # 1: 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
        'rules_generalization': 'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
        'rules_merge': 0.8,  # merge rules with similarity > this 'merge' criteria
        'rules_aggregation': 0.2,  # aggregate rules similarity > this criteria
        'tmpath': module_path + '/tmp/',  # legacy, default if not temp_dir
        'verbose': 'min',  # display intermediate results: 'none', 'min', 'mid', 'max'
        'linkage_limit': 1000  # Link Grammar parameter for tests
    }
    # Additional (optional) parameters for parse_metrics (_abiity & _quality):
    # 'test_corpus': module_path + '/data/POC-Turtle/poc-turtle-corpus.txt',
    # 'reference_path': module_path + '/data/POC-Turtle/poc-turtle-parses-expected.txt',
    # 'template_path': 'poc-turtle',  # FIXME: changed in June 2018 Grammar Tester
    pass
def test_pqa_english_noamb_ddrkd_no_generalization(self):
    """POC-English-NoAmb, vector space + SVD + k-means, generalization off:
    learned grammar must reach PA * recall > 0.99 (with bounded retries)."""
    parses_dir = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/noamb_pqa_ddrkd_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp, rp :: (test) corpus_path and reference_path:
    cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
    rp = parses_dir + '/poc-english_noAmb-parses-gold.txt'
    kwargs = {
        'input_parses': parses_dir,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'vectors',
        'dim_reduction': 'svd',
        'clustering': ('kmeans', 'kmeans++', 18),
        'cluster_range': (12, 12, 5),
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min',
    }
    # Sometimes pqa_meter (with test_grammar updated 2018-10-19) returns
    # pa, recall = 0, 0 — retry up to 25 times.
    # FIXME: check with further test_grammar updates and delete.
    for _attempt in range(25):
        re = learn_grammar(**kwargs)
        pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath,
                                              cp, rp, **kwargs)
        print(
            f'\nnoAmb dDRKd: pa {round(pa,3)}, f1 {round(f1,3)}, precision {round(precision,3)}, recall {round(recall,3)} \n'
        )
        if pa * recall >= 0.1:
            break
    self.assertTrue(pa * recall > 0.99, f'{pa} * {recall} > 0.99')
# FIX: os and sys are used immediately below (os.path.abspath, sys.path)
# but were not imported in this chunk — add them.
import os
import sys
import unittest
from decimal import Decimal

module_path = os.path.abspath(os.path.join('.'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.grammar_learner.utl import UTC
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.learner import learn_grammar
from src.grammar_learner.pqa_table import pqa_meter
# from ull.grammartest.optconst import *

# Learn a grammar from the dynsym dataset and extract its rule lines
# (lines starting with '"' or '(' in the emitted .dict file).
input_parses = module_path + '/tests/data/dataSymbols/dynsym/'
batch_dir = module_path + '/output/test_dynsym_' + str(UTC())[:10]
prj_dir = batch_dir + '/dynsym_rules_b/'
if check_dir(prj_dir, create=True, verbose='max'):
    outpath = prj_dir
kwargs = {'input_parses': input_parses, 'output_grammar': outpath}
response = learn_grammar(**kwargs)
with open(response['grammar_file'], 'r') as f:
    rules = f.read().splitlines()
rule_list = [line for line in rules if line[0:1] in ['"', '(']]
# NOTE(review): the baseline comparison against a stored .dict was
# commented out in the original; removed here as dead code.
def main(argv):
    """ Usage: python ppln.py config.json

    Reads a JSON config, learns a grammar, optionally tests it with
    pqa_meter, and writes stats/logs to the project directory.
    """
    print('\nGrammar Learner + Tester ppln v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    # FIX: getopt returns (option, value) pairs; the original compared each
    # whole pair to '-h', so the help flag was never recognized.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: ppln <json-config-file>''')
            sys.exit()
    # FIX: guard against a missing positional argument (IndexError before).
    if len(args) < 1:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    config_json = args[0]
    with open(config_json) as f:
        kwargs = json.load(f)
    corpus = kwargs['corpus']
    del kwargs['corpus']
    dataset = kwargs['dataset']
    del kwargs['dataset']
    # Resolve input parses: default under module_path; absolute '/home/'
    # paths pass through, other configured paths are rooted at module_path.
    if 'input_parses' not in kwargs:
        kwargs['input_parses'] = module_path + '/data/' + corpus + '/' + dataset
    elif '/home/' not in kwargs['input_parses']:
        kwargs['input_parses'] = module_path + kwargs['input_parses']
    if 'output_grammar' not in kwargs:
        if 'out_path' in kwargs:
            if '/home/' in kwargs['out_path']:
                kwargs['output_grammar'] = kwargs['out_path']
            else:
                kwargs['output_grammar'] = module_path + kwargs['out_path']
        else:
            print('Please set "output_grammar" or "out_path" in config.json')
            sys.exit()
    # Resolve temporary directory: '' disables, 'home' paths pass through,
    # everything else is rooted at module_path.
    if 'tmpath' not in kwargs or len(kwargs['tmpath']) == 0:
        kwargs['tmp_dir'] = ''
    else:
        if 'home' in kwargs['tmpath']:
            tmpath = kwargs['tmpath']
        else:
            tmpath = module_path + kwargs['tmpath']
        kwargs['tmp_dir'] = tmpath if check_dir(tmpath, True, 'none') else ''
    rules, re = learn(**kwargs)
    if 'error' in re:
        print('Grammar Learner error log:\n', re)
        sys.exit()
    # FIX: .get with default 0 — the original raised KeyError when the
    # config did not set 'linkage_limit'; now the test is simply skipped.
    if kwargs.get('linkage_limit', 0) > 0:
        og = module_path + kwargs['out_path']
        rp = module_path + kwargs['reference']
        if 'test_corpus' in kwargs:
            cp = module_path + kwargs['test_corpus']
        else:
            cp = rp  # test corpus path = reference parses path
        start = time.time()
        a, f1, precision, q = pqa_meter(re['grammar_file'], og, cp, rp,
                                        **kwargs)
        re.update({'grammar_test_time': sec2string(time.time() - start)})
    stats = []
    if 'grammar_learn_time' in re:
        stats.append(['Grammar learn time', re['grammar_learn_time']])
    if 'grammar_test_time' in re:
        stats.append(['Grammar test time ', re['grammar_test_time']])
    if len(stats) > 0:
        x = re['corpus_stats_file']
        list2file(stats, x[:x.rfind('/')] + '/learn_&_test_stats.txt')
    copy(config_json, re['project_directory'])
    with open(re['project_directory'] + '/grammar_learner_log.json', 'w') as f:
        f.write(json.dumps(re))
    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(test_stats(re))
    print('Output directory:', re['project_directory'], '\n')