Example #1
0
 def test_pqa_english_noamb_diled_no_generalization(self):
     """dILEd settings (discrete word space, group clustering, no
     generalization) on POC-English-NoAmb: learn a grammar and require
     parse-ability * recall > 0.99."""
     input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(
         UTC())[:10]
     prj_dir = batch_dir + '/noamb_pqa_diled_no_generalization/'
     # NOTE(review): if check_dir ever fails, outpath stays unbound and the
     # kwargs dict below raises NameError -- confirm check_dir cannot fail.
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # cp,rp :: (test) corpus_path and reference_path:
     cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
     rp = input_parses + '/poc-english_noAmb-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     re = learn_grammar(**kwargs)
     pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp,
                                           rp, **kwargs)
     # FIX: the message is shown only on failure, i.e. when the product did
     # NOT exceed the threshold, so report "!>" (file convention, cf. the
     # older "!> 0.99" wording), not ">".
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " !> 0.99")
Example #2
0
 def test_pqa_turtle_ddrkd_no_generalization(self):
     """dDRKd settings (vector word space, SVD reduction, k-means
     clustering, no generalization) on POC-Turtle: learn a grammar and
     require parse-ability * recall > 0.99."""
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(
         UTC())[:10]
     prj_dir = batch_dir + '/turtle_pqa_ddrkd_no_generalization/'
     # NOTE(review): outpath stays unbound if check_dir fails -- confirm
     # check_dir always succeeds with create=True.
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # cp,rp :: (test) corpus_path and reference_path:
     cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
     rp = input_parses + '/poc-turtle-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'vectors',
         'dim_reduction': 'svd',
         'clustering': ('kmeans', 'kmeans++', 18),
         'cluster_range': (20, 2, 9),
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     re = learn_grammar(**kwargs)
     pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp,
                                           rp, **kwargs)
     # FIX: failure message reports "!>" -- it is shown only when the
     # product is NOT above the threshold.
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " !> 0.99")
Example #3
0
 def test_pqa_turtle_diled_no_generalization(self):
     """dILEd settings (discrete word space, group clustering, no
     generalization) on POC-Turtle: learn a grammar and require
     parse-ability * recall > 0.99."""
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/turtle_pqa_diled_no_generalization/'
     # NOTE(review): outpath stays unbound if check_dir fails -- confirm
     # check_dir always succeeds with create=True.
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # cp,rp :: (test) corpus_path and reference_path:
     cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
     rp = input_parses + '/poc-turtle-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     re = learn_grammar(**kwargs)
     # 81019 commented-out pqa_meter variants removed per their own FIXME.
     pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp,
                                           rp, **kwargs)
     # pa, f1, precision, recall: <float> 0.0 - 1.0
     # FIX: failure message reports "!>" -- it is shown only on failure.
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " !> 0.99")
Example #4
0
 def test_turtle_generalize_rules(self):
     """Learn a POC-Turtle grammar with jaccard rules generalization and
     compare the produced rule lines with a stored baseline .dict file."""
     base = module_path + '/tests/data/POC-Turtle/' + \
         'generalized_rules/dict_6C_2018-10-03_0006.4.0.dict'
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/turtle_lw_&_dot_generalized_rules/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': 'LEFT-WALL',
         'period': True,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'jaccard',
         'tmpath': module_path + '/tmp/',
         'verbose': 'none'
     }
     response = learn_grammar(**kwargs)
     with open(response['grammar_file'], 'r') as f:
         rules = f.read().splitlines()
     # Keep only rule lines: category ("...) and disjunct ((...) entries.
     rule_list = [line for line in rules if line[0:1] in ['"', '(']]
     with open(base, 'r') as f:
         lst = f.read().splitlines()
     base_list = [line for line in lst if line[0:1] in ['"', '(']]
     # FIX: the old if/else branched on len() equality only to re-assert it
     # in the else branch, and the bare asserts carried no diff; assertEqual
     # covers both length and content mismatches with a readable report.
     self.assertEqual(rule_list, base_list)
Example #5
0
def main(argv):
    """ Usage: python tstr.py config.json

    Test an existing .dict grammar against reference parses with
    pqa_meter and report PA / PQ / F1.
    """
    print('\nGrammar Tester v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: tstr <json-config-file>''')
        sys.exit()
    # FIX: getopt returns (option, value) pairs; the old loop compared the
    # whole tuple with '-h', so -h (and the registered --help) never matched.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: tstr <json-config-file>''')
            sys.exit()
    # FIX: guard against a missing positional argument -- the old code
    # indexed args[0] unconditionally and died with IndexError.
    if not args:
        print('''Usage: tstr <json-config-file>''')
        sys.exit()
    config_json = args[0]

    with open(config_json) as f:
        kwargs = json.load(f)

    re = {}

    if 'input_grammar' in kwargs:  # Test .dict file    # 90123
        ig = module_path + kwargs['input_grammar']
        og = module_path + kwargs['out_path']           # og: output grammar
        rp = module_path + kwargs['reference']          # rp: reference path
        if 'test_corpus' in kwargs:
            cp = module_path + kwargs['test_corpus']    # cp: corpus path
        else:
            cp = rp  # test corpus path = reference parses path
        print('Input grammar:', ig, '\nOutput directory:', og)
        if check_dir(og, True, 'max'):
            print('Grammar test started', UTC(), '\n')
            start = time.time()
            a, f1, precision, q = pqa_meter(ig, og, cp, rp, **kwargs)
            re.update({'grammar_test_time': sec2string(time.time() - start)})
        else:
            # FIX: exit on a bad output path; the old code fell through and
            # later crashed with NameError on the unbound a / q / f1.
            print('Output path error:', og)
            sys.exit()
    else:
        print('Please set "input grammar" in config.json')
        sys.exit()

    stats = []
    if 'grammar_test_time' in re:
        stats.append(['Grammar test time ', re['grammar_test_time']])
    if len(stats) > 0:
        list2file(stats, og + '/test_stats.txt')

    copy(config_json, og)

    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(f'PA = {int(round(a*100,0))}%, PQ = {int(round(q*100,0))}%, '
          f'F1 = {round(f1,2)}')
Example #6
0
def main(argv):
    """ Usage: python ppln.py config.json

    Learn a grammar and test it via wide_rows, driven by a JSON config.
    """
    print('\nGrammar Learner + Tester ppln v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    # FIX: getopt yields (option, value) pairs; comparing the whole tuple
    # with '-h' never matched, so -h/--help were silently ignored.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: ppln <json-config-file>''')
            sys.exit()
    # FIX: exit with usage instead of IndexError when no config is given.
    if not args:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    config_json = args[0]

    with open(config_json) as f:
        kwargs = json.load(f)

    corpus = kwargs['corpus']
    del kwargs['corpus']
    dataset = kwargs['dataset']
    del kwargs['dataset']
    if 'input_parses' not in kwargs:
        # NOTE(review): unlike the sibling pipeline, module_path is NOT
        # prefixed here -- confirm wide_rows resolves this relative path.
        kwargs['input_parses'] = '/data/' + corpus + '/' + dataset

    line = [[0, corpus, dataset, 0, 0, kwargs['rules_generalization']]]
    out_path = module_path + kwargs['out_path']
    rp = module_path + kwargs['reference']
    if 'test_corpus' in kwargs:
        cp = module_path + kwargs['test_corpus']
    else:
        cp = rp  # test corpus path = reference parses path
    # Resolve tmp_dir: empty unless a usable tmpath is configured; paths
    # containing 'home' are taken as-is, others are module_path-relative.
    if 'tmpath' not in kwargs or len(kwargs['tmpath']) == 0:
        kwargs['tmp_dir'] = ''
    else:
        tmpath = kwargs['tmpath'] if 'home' in kwargs['tmpath'] \
            else module_path + kwargs['tmpath']
        kwargs['tmp_dir'] = tmpath if check_dir(tmpath, True, 'none') else ''

    a, _, hdr, log, rules = wide_rows(line, out_path, cp, rp, (1, 1), **kwargs)

    copy(config_json, log['project_directory'])

    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(test_stats(log))
    print('Output directory:', log['project_directory'], '\n')
Example #7
0
 def test_pqa_english_noamb_ddrkd_no_generalization(self):
     """dDRKd settings (vector word space, SVD, k-means, no generalization)
     on POC-English-NoAmb: retry learning until parse-ability * recall is
     non-trivial, then require the product to exceed 0.99."""
     input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(
         UTC())[:10]
     prj_dir = batch_dir + '/noamb_pqa_ddrkd_no_generalization/'
     # NOTE(review): outpath stays unbound if check_dir fails -- confirm
     # check_dir always succeeds with create=True.
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # cp,rp :: (test) corpus_path and reference_path:
     cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
     rp = input_parses + '/poc-english_noAmb-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'vectors',
         'dim_reduction': 'svd',
         'clustering': ('kmeans', 'kmeans++', 18),
         'cluster_range': (12, 12, 5),
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     # Sometimes pqa_meter(with test_grammar updated 2018-10-19) returns pa,recall = 0,0
     # FIXME: check with further test_grammar updates and delete.
     # Retry (up to 25 attempts) until pa * recall leaves the 0,0 regime.
     x = 0.
     n = 0
     while x < 0.1:
         re = learn_grammar(**kwargs)
         pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath,
                                               cp, rp, **kwargs)
         print(
             f'\nnoAmb dDRKd: pa {round(pa,3)}, f1 {round(f1,3)}, precision {round(precision,3)}, recall {round(recall,3)} \n'
         )
         x = pa * recall
         n += 1
         if n > 24: break
     # FIX: failure message reports "!>" -- it is shown only on failure.
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " !> 0.99")
Example #8
0
module_path = os.path.abspath(os.path.join('.'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.learner import learn_grammar
from src.grammar_learner.pqa_table import pqa_meter
# from ull.grammartest.optconst import *

#base  = module_path + '/tests/data/POC-Turtle/' + \
#    'generalized_rules/dict_6C_2018-10-03_0006.4.0.dict'

# Learn a grammar from the dynamic-symbols dataset with default learner
# settings and collect the rule lines ("..." categories / (...) disjuncts)
# from the produced grammar file.
input_parses = module_path + '/tests/data/dataSymbols/dynsym/'
batch_dir = module_path + '/output/test_dynsym_' + str(UTC())[:10]
prj_dir = batch_dir + '/dynsym_rules_b/'
# NOTE(review): outpath stays unbound if check_dir fails -- the kwargs line
# below would then raise NameError; confirm check_dir cannot fail here.
if check_dir(prj_dir, create=True, verbose='max'):
    outpath = prj_dir
kwargs = {'input_parses': input_parses, 'output_grammar': outpath}
response = learn_grammar(**kwargs)
with open(response['grammar_file'], 'r') as f:
    rules = f.read().splitlines()
rule_list = [line for line in rules if line[0:1] in ['"', '(']]
# FIX: dead commented-out baseline comparison removed (no baseline file is
# configured in this script).
Example #9
0
def main(argv):
    """ Usage: python ppln.py config.json

    Learn a grammar from a JSON config, optionally test it with pqa_meter
    (when linkage_limit > 0), save stats and a copy of the config plus a
    JSON log to the project directory.
    """
    print('\nGrammar Learner + Tester ppln v.' + __version__, 'started', UTC(),
          '| Python v.' + platform.python_version(), '\n')
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    # FIX: getopt yields (option, value) pairs; the old tuple comparison
    # with '-h' never matched, so -h/--help were silently ignored.
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print('''Usage: ppln <json-config-file>''')
            sys.exit()
    # FIX: exit with usage instead of IndexError when no config is given.
    if not args:
        print('''Usage: ppln <json-config-file>''')
        sys.exit()
    config_json = args[0]

    with open(config_json) as f:
        kwargs = json.load(f)

    corpus = kwargs['corpus']
    del kwargs['corpus']
    dataset = kwargs['dataset']
    del kwargs['dataset']
    # Resolve paths: values containing '/home/' are absolute, anything else
    # is relative to module_path.
    if 'input_parses' not in kwargs:
        kwargs['input_parses'] = module_path + '/data/' + corpus + '/' + dataset
    elif '/home/' not in kwargs['input_parses']:
        # FIX: the old else-branch also contained a no-op self-assignment
        # for the '/home/' case; only the relative case needs rewriting.
        kwargs['input_parses'] = module_path + kwargs['input_parses']
    if 'output_grammar' not in kwargs:
        if 'out_path' in kwargs:
            if '/home/' in kwargs['out_path']:
                kwargs['output_grammar'] = kwargs['out_path']
            else:
                kwargs['output_grammar'] = module_path + kwargs['out_path']
        else:
            print('Please set "output_grammar" or "out_path" in config.json')
            sys.exit()
    # Resolve tmp_dir: empty unless a usable tmpath is configured; paths
    # containing 'home' are taken as-is, others are module_path-relative.
    if 'tmpath' not in kwargs or len(kwargs['tmpath']) == 0:
        kwargs['tmp_dir'] = ''
    else:
        tmpath = kwargs['tmpath'] if 'home' in kwargs['tmpath'] \
            else module_path + kwargs['tmpath']
        kwargs['tmp_dir'] = tmpath if check_dir(tmpath, True, 'none') else ''

    rules, re = learn(**kwargs)
    if 'error' in re:
        print('Grammar Learner error log:\n', re)
        sys.exit()

    # NOTE(review): 'linkage_limit' is assumed present in the config --
    # a missing key raises KeyError here; confirm or switch to kwargs.get().
    if kwargs['linkage_limit'] > 0:
        og = module_path + kwargs['out_path']
        rp = module_path + kwargs['reference']
        if 'test_corpus' in kwargs:
            cp = module_path + kwargs['test_corpus']
        else:
            cp = rp  # test corpus path = reference parses path
        start = time.time()
        a, f1, precision, q = pqa_meter(re['grammar_file'], og, cp, rp,
                                        **kwargs)
        re.update({'grammar_test_time': sec2string(time.time() - start)})

    stats = []
    if 'grammar_learn_time' in re:
        stats.append(['Grammar learn time', re['grammar_learn_time']])
    if 'grammar_test_time' in re:
        stats.append(['Grammar test time ', re['grammar_test_time']])
    if len(stats) > 0:
        x = re['corpus_stats_file']
        list2file(stats, x[:x.rfind('/')] + '/learn_&_test_stats.txt')

    copy(config_json, re['project_directory'])
    with open(re['project_directory'] + '/grammar_learner_log.json', 'w') as f:
        f.write(json.dumps(re))

    print('\nGrammar learning and the learned grammar test ended', UTC())
    print(test_stats(re))
    print('Output directory:', re['project_directory'], '\n')