def path_loader(
        length,
        gtps_filename=GTPS_FILENAME,
        sparql_endpoint=SPARQL_ENDPOINT,
        eval_data_graph=EVAL_DATA_GRAPH,
        load=True,
        clear=True,
        **kwds
):
    gp = random_path(length)
    logger.info(
        'Generated random graph pattern with path length %d:\n%s' % (
            length, gp))

    # get list of semantic association pairs
    semantic_associations = get_semantic_associations(
        fn=gtps_filename,
        limit=None,
    )
    gtps = semantic_associations

    triples = generate_triples(gp, gtps)
    if load:
        load_triples_into_endpoint(
            triples,
            sparql_endpoint=sparql_endpoint,
            graph=eval_data_graph,
            clear=clear,
        )
    return gp
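
# A minimal usage sketch (an assumption, not part of the original module):
# inject the triples for a random path of length 3 into the configured
# endpoint and print the generated pattern.
if __name__ == '__main__':
    import logging
    logging.basicConfig(level=logging.INFO)
    eval_gp = path_loader(length=3)
    print(eval_gp)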
def main():
    semantic_associations = get_semantic_associations(
        config.GT_ASSOCIATIONS_FILENAME)
    assocs_train, assocs_test = split_training_test_set(
        semantic_associations, variant='random')

    # setup node expander
    sparql = SPARQLWrapper(config.SPARQL_ENDPOINT)

    predict_set = assocs_test
    for method, query in sorted(prediction_queries.items()):
        target_idxs = []
        for source, target in predict_set:
            prediction = predict_target_with_query(sparql, query, source)
            target_idxs.append(find_in_prediction(prediction, target))
        print("'%s': %s," % (method, target_idxs))
def main():
    from rdflib import Variable
    gp = GraphPattern((
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    ))

    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000k_uri_pairs.csv.gz',
        limit=None,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations

    triples = generate_triples(gp, stps)
    load_triples_into_endpoint(triples)
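
# A rough sketch (an assumption, not eval.data_generator's actual code) of
# what generate_triples means for the pattern above and a single
# (source, target) pair: bind ?source and ?target, mint a fresh node for
# every other variable, and emit one triple per pattern triple, so the two
# generated triples share their object via ?v2.
def instantiate_pattern_sketch(gp, source, target):
    from itertools import count
    from rdflib import URIRef
    fresh = (URIRef('urn:gen:n%d' % i) for i in count())
    bindings = {SOURCE_VAR: source, TARGET_VAR: target}
    res = []
    for triple in gp:
        for node in triple:
            if node not in bindings:
                bindings[node] = next(fresh)
        res.append(tuple(bindings[node] for node in triple))
    return res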
def main():
    semantic_associations = get_semantic_associations(
        config.GT_ASSOCIATIONS_FILENAME)
    assocs_train, assocs_test = split_training_test_set(
        semantic_associations, variant='random')

    # setup node expander
    sparql = SPARQLWrapper(config.SPARQL_ENDPOINT)

    predict_list = assocs_test

    # degree, pagerank and hits
    for method, query in sorted(prediction_queries.items()):
        target_idxs = []
        for source, target in predict_list:
            logger.info(
                'method: %s, predicting targets for %s, ground truth: %s',
                method, source.n3(), target.n3())
            prediction = predict_target_with_query(sparql, query, source)
            idx = find_in_prediction(prediction, target)
            logger.info(
                format_prediction_results(method, prediction, target, idx))
            target_idxs.append(idx)
        print("'%s': %s," % (method, target_idxs))

    # milne-witten relatedness
    for method, pred in (('mw_wl', 'dbo:wikiPageWikiLink'),):
        target_idxs = []
        for source, target in predict_list:
            logger.info(
                'method: %s, predicting targets for %s, ground truth: %s',
                method, source.n3(), target.n3())
            prediction = predict_target_with_milne_witten(
                sparql, pred, source)
            idx = find_in_prediction(prediction, target)
            logger.info(
                format_prediction_results(method, prediction, target, idx))
            target_idxs.append(idx)
        print("'%s': %s," % (method, target_idxs))
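
# A minimal sketch of the helper used above (an assumption about its
# behaviour, not the project's actual implementation): return the rank of
# the ground truth target within the ordered prediction list, or -1 if it
# was not predicted at all.
def find_in_prediction_sketch(prediction, target):
    try:
        return list(prediction).index(target)
    except ValueError:
        return -1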
def main():
    from rdflib import Variable
    # the following triples would time out if vars_joint was 0:
    # ?s a owl:Thing . ?t a owl:Thing .
    gp = GraphPattern((
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    ))

    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000_uri_pairs.csv.gz',
        limit=100,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations
    print(len(stps))

    triples = generate_triples(gp, stps)
    for t in triples:
        print(t)
def main():
    from rdflib import Variable
    # gp = GraphPattern((
    #     (SOURCE_VAR, Variable('v1'), Variable('v2')),
    #     (TARGET_VAR, Variable('v3'), Variable('v2')),
    # ))
    gp = GraphPattern((
        (Variable('v1'), Variable('v2'), SOURCE_VAR),
        (Variable('v1'), Variable('v3'), Variable('v4')),
        (Variable('v4'), Variable('v5'), TARGET_VAR),
    ))

    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000_uri_pairs.csv.gz',
        limit=None,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations

    triples = generate_triples(gp, stps)
    load_triples_into_endpoint(triples)
import logging

import rdflib
from rdflib import URIRef
from rdflib import Variable

from gp_learner import mutate_increase_dist
from gp_learner import mutate_merge_var
from gp_learner import mutate_simplify_pattern
from graph_pattern import GraphPattern
from graph_pattern import SOURCE_VAR
from graph_pattern import TARGET_VAR
from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from gtp_scores import GTPScores

logger = logging.getLogger(__name__)

dbp = rdflib.Namespace('http://dbpedia.org/resource/')
wikilink = URIRef('http://dbpedia.org/ontology/wikiPageWikiLink')

ground_truth_pairs = get_semantic_associations()
ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
gtp_scores = GTPScores(ground_truth_pairs)


def test_mutate_increase_dist():
    gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
    res = mutate_increase_dist(gp)
    assert gp != res
    assert gp.diameter() + 1 == res.diameter()
    assert gp.vars_in_graph == {SOURCE_VAR, TARGET_VAR}


def test_mutate_merge_var():
    p = Variable('p')
    q = Variable('q')
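    # Hedged sketch of the truncated test body; the exact call shape of
    # mutate_merge_var is an assumption, not taken from the source. After a
    # merge, the pattern should not use more variables than before:
    gp = GraphPattern([
        (SOURCE_VAR, p, TARGET_VAR),
        (SOURCE_VAR, q, TARGET_VAR),
    ])
    res = mutate_merge_var(gp)
    assert len(res.vars_in_graph) <= len(gp.vars_in_graph)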
def main(**kwds):
    from eval.enumerate import load_pattern
    from eval.random_path_loader import path_loader
    from ground_truth_tools import get_semantic_associations
    from utils import log_all_exceptions

    logging.info('encoding check: äöüß🎅')  # logging utf-8 byte string
    logging.info(u'encoding check: äöüß\U0001F385')  # logging unicode string
    logging.info(u'encoding check: äöüß\U0001F385'.encode('utf-8'))  # convert
    print('encoding check: äöüß🎅')  # printing utf-8 byte string
    print(u'encoding check: äöüß\U0001F385')  # printing unicode string

    if kwds['method'] == 'random_path':
        # inject triples for a random path of given length into endpoint
        eval_gp = path_loader(**kwds)
        result_filename = 'path_length_eval_result.txt'
    elif kwds['method'] == 'enum':
        from eval.data_generator import generate_triples
        from eval.data_loader import load_triples_into_endpoint
        seq = int(os.getenv('SEQ_NUMBER'))  # see script/run_multi_range.sh
        eval_gp = load_pattern(kwds['length'], seq)
        logger.info(
            'Loaded enumerated graph pattern number %d with length %d:\n%s'
            % (seq, kwds['length'], eval_gp))
        # get list of semantic association pairs
        gtps = get_semantic_associations(
            fn=kwds['GT_ASSOCIATIONS_FILENAME'],
            limit=None,
        )
        triples = generate_triples(eval_gp, gtps)
        load_triples_into_endpoint(
            triples,
            sparql_endpoint=kwds['SPARQL_ENDPOINT'],
            graph=kwds['eval_data_graph'],
        )
        result_filename = 'enum_eval_result.txt'
    else:
        raise NotImplementedError(kwds['method'])

    sparql_endpoint = kwds['sparql_endpoint']
    gtps_filename = kwds['gtps_filename']
    length = kwds['length']

    gtps = tuple(sorted(get_semantic_associations(gtps_filename)))
    # for s, t in gtps:
    #     print(curify(s))
    #     print(curify(t))
    #     print('')

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)

    tic = datetime.utcnow()
    # noinspection PyBroadException
    try:
        pattern_found = log_all_exceptions(logger)(_main)(sparql, gtps)
        return_code = 0 if pattern_found else 1
        tac = datetime.utcnow()
        logger.info(
            "search for pattern took %s and was %s",
            tac - tic,
            'successful' if pattern_found else 'unsuccessful'
        )
    except Exception:
        tac = datetime.utcnow()
        logger.exception(
            "search for pattern took %s and was aborted due to exception",
            tac - tic,
        )
        return_code = 2

    # return code 0 is success; turn it into a more intuitive encoding
    # for the result file
    res = {0: 1, 1: 0, 2: -1}[return_code]
    fn = path.join(config.RESDIR, result_filename)
    with open(make_dirs_for(fn), 'a') as f:
        f.write(
            'len: %d, result: %d, took: %.1f s, end (UTC): %s\n'
            'eval %s\n\n' % (
                length, res, timedelta_to_s(tac - tic), datetime.utcnow(),
                eval_gp
            )
        )
    sys.exit(return_code)
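
# Minimal sketches of two small helpers used above (assumptions about
# utils.timedelta_to_s and utils.make_dirs_for, not their actual code):
# timedelta_to_s turns a datetime.timedelta into float seconds, and
# make_dirs_for creates the parent directory of a filename and passes the
# filename through, so it can be wrapped directly around open().
import os


def timedelta_to_s_sketch(td):
    return td.total_seconds()


def make_dirs_for_sketch(fn):
    dirname = os.path.dirname(fn)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    return fn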
# encoding: utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging

from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from ground_truth_tools import k_fold_cross_validation

logger = logging.getLogger(__name__)

associations = get_semantic_associations()


def test_split_train_test_set():
    vr = split_training_test_set(associations)
    train, test = vr
    logger.info("just random: train: %d, test: %d", len(train), len(test))

    vtnd = split_training_test_set(
        associations, variant='target_node_disjoint')
    train, test = vtnd
    logger.info(
        "target node disjoint: train: %d, test: %d", len(train), len(test))

    vnd = split_training_test_set(associations, variant='node_disjoint')
    train, test = vnd
    logger.info("node disjoint: train: %d, test: %d", len(train), len(test))

    assert vr[0] == vtnd[0] == vnd[0], \
        "train set shouldn't be influenced by different splitting variant"
    assert set(vr[1]) > set(vtnd[1]) > set(vnd[1]), \
        "test set expected to shrink for more restrictive splitting variants"


def test_k_fold_cross_validation():
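    # Hedged sketch of this truncated test body; the interface
    # k_fold_cross_validation(associations, k) yielding k (train, test)
    # pairs whose test folds partition the input is an assumption:
    k = 10
    seen = []
    for train, test in k_fold_cross_validation(associations, k):
        assert len(train) + len(test) == len(associations)
        seen.extend(test)
    assert sorted(seen) == sorted(associations)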