def test_dbpedia_loading():
    adjpath = abspath(expanduser('~/Projects/truthy_data/dbpedia/2016-04/processed/kg/adjacency.npy'))
    shape = (6060993, 6060993, 663)
    dirpath = join(dirname(adjpath), '_undir')
    G = Graph.reconstruct(dirpath, shape, sym=True)
    assert np.all(G.csr.indices >= 0)

    # reverse graph
    dirpath = join(dirname(adjpath), '_revundir')
    revG = Graph.reconstruct(dirpath, shape, sym=True)
    assert np.all(revG.csr.indices >= 0)
def test_graph1_creation():
    shape = np.asarray([4, 4, 2], dtype=np.int32)
    adj = np.asarray([
        [0, 1, 0],
        [0, 2, 1],
        [1, 2, 0],
        [1, 2, 1],
        [1, 3, 1],
        [2, 3, 0],
        [2, 1, 1],
        [2, 3, 1],
    ], dtype=np.int32)
    values = np.arange(adj.shape[0]) + 10.

    # create graph
    expect_G = np.asarray([
        [0., 1., 0., 0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0., 0., 1., 1.],
        [0., 1., 0., 1., 1., 1., 0., 1.],
        [0., 0., 1., 0., 0., 1., 1., 0.]
    ])
    G = make_graph(adj, shape, sym=True, save_csc=True)
    assert np.array_equal(G.csr.toarray(), G.csc.toarray())
    dirpath = join(abspath(expanduser(os.curdir)), '_undir')
    if not exists(dirpath):
        os.mkdir(dirpath)
    G.save_graph(dirpath)
    assert np.array_equal(G.indeg_vec, np.asarray([2, 3, 3, 2]))
    assert np.array_equal(expect_G, G.csr.toarray())

    # rebuild graph
    G = Graph.reconstruct(dirpath, shape, sym=True, save_csc=True)
    assert np.array_equal(expect_G, G.csr.toarray())
    if exists(dirpath):
        shutil.rmtree(dirpath)
        print('Removed: %s' % dirpath)
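# For reference, the dense `expect_G` above follows the unrolled column layout used
# throughout this codebase: a node-by-(relation x node) matrix whose column index is
# p * N + o, symmetrized because sym=True. A minimal sketch (pure numpy, independent
# of make_graph) that rebuilds it directly from the (s, o, p) triples in `adj`:

import numpy as np

N, R = 4, 2  # nodes, relations
adj_demo = np.asarray([[0, 1, 0], [0, 2, 1], [1, 2, 0], [1, 2, 1],
                       [1, 3, 1], [2, 3, 0], [2, 1, 1], [2, 3, 1]])

dense = np.zeros((N, R * N))
for s, o, p in adj_demo:
    dense[s, p * N + o] = 1.  # forward edge
    dense[o, p * N + s] = 1.  # symmetric (undirected) counterpart

# `dense` now matches the expect_G matrix asserted in test_graph1_creation()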
def test_dbpedia():
    dirpath = abspath(expanduser('./data/kg/_undir/'))
    shape = (6060993, 6060993, 663)
    G = Graph.reconstruct(dirpath, shape, sym=True)
    cost_vec = np.log(G.indeg_vec)
    s, p, o = 2145431, 178, 459128  # Gravity, Alfonso Cuarón
    mincostflow = succ_shortest_path(G, cost_vec, s, p, o)
    print mincostflow
def main(args=None):
    # parse arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-m', type=str, required=True, dest='method',
                        help='Method to use: stream, relklinker, klinker, \
                        predpath, pra, katz, pathent, simrank, adamic_adar, jaccard, degree_product.')
    parser.add_argument('-d', type=str, required=True, dest='dataset',
                        help='Dataset to test on.')
    parser.add_argument('-o', type=str, required=True, dest='outdir',
                        help='Path to the output directory.')
    args = parser.parse_args()

    # logging
    disable_logging(log.DEBUG)

    if args.method not in (
            'stream', 'relklinker', 'klinker', 'predpath', 'pra', 'katz',
            'pathent', 'simrank', 'adamic_adar', 'jaccard', 'degree_product'):
        raise Exception('Invalid method specified.')

    # ensure input file and output directory are valid.
    outdir = abspath(expanduser(args.outdir))
    assert exists(outdir)
    args.outdir = outdir
    datafile = abspath(expanduser(args.dataset))
    assert exists(datafile)
    args.dataset = datafile
    log.info('Launching {}..'.format(args.method))
    log.info('Dataset: {}'.format(basename(args.dataset)))
    log.info('Output dir: {}'.format(args.outdir))

    # read data
    df = pd.read_table(args.dataset, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(args.dataset)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))
    df = spo_df[['sid', 'pid', 'oid']].values
    subs, preds, objs = df[:, 0].astype(_int), df[:, 1].astype(_int), df[:, 2].astype(_int)

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # execute
    base = splitext(basename(args.dataset))[0]
    t1 = time()
    if args.method == 'stream':  # KNOWLEDGE STREAM (KS)
        # compute min. cost flow
        log.info('Computing KS for {} triples..'.format(spo_df.shape[0]))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            outjson = join(args.outdir, 'out_kstream_{}_{}.json'.format(base, DATE))
            outcsv = join(args.outdir, 'out_kstream_{}_{}.csv'.format(base, DATE))
            mincostflows, times = compute_mincostflow(G, relsim, subs, preds, objs, outjson)
            # save the results
            spo_df['score'] = mincostflows
            spo_df['time'] = times
            spo_df = normalize(spo_df)
            spo_df.to_csv(outcsv, sep=',', header=True, index=False)
            log.info('* Saved results: %s' % outcsv)
        log.info('Mincostflow computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'relklinker':  # RELATIONAL KNOWLEDGE LINKER (KL-REL)
        log.info('Computing KL-REL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_relklinker(G, relsim, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_relklinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('Relational KL computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'klinker':
        log.info('Computing KL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_klinker(G, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_klinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('KL computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'predpath':  # PREDPATH
        vec, model = predpath_train_model(G, spo_df)  # train
        print 'Time taken: {:.2f}s\n'.format(time() - t1)
        # save model
        predictor = {'dictvectorizer': vec, 'model': model}
        try:
            outpkl = join(args.outdir, 'out_predpath_{}_{}.pkl'.format(base, DATE))
            with open(outpkl, 'wb') as g:
                pkl.dump(predictor, g, protocol=pkl.HIGHEST_PROTOCOL)
            print 'Saved: {}'.format(outpkl)
        except IOError, e:
            raise e
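# `normalize` above is imported from elsewhere and not shown in this section. A minimal
# sketch of what it plausibly does, assuming a simple min-max rescaling of the 'score'
# column into [0, 1] -- the actual helper may differ:

def normalize(spo_df):
    """Hypothetical sketch: min-max rescale the 'score' column to [0, 1]."""
    lo, hi = spo_df['score'].min(), spo_df['score'].max()
    if hi > lo:  # guard against a constant column
        spo_df['score'] = (spo_df['score'] - lo) / (hi - lo)
    return spo_df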
class Pra(object):
    name = 'pra'

    HOME = abspath(expanduser('./data/'))
    if not exists(HOME):
        print 'Data directory not found: %s' % HOME
        print 'Download data per instructions on:'
        print '\thttps://github.com/shiralkarprashant/knowledgestream#data'
        print 'and enter the directory path below.'
        data_dir = raw_input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
            if not os.path.isdir(data_dir):
                raise Exception('Entered path "%s" not a directory.' % data_dir)
            if not exists(data_dir):
                raise Exception('Directory does not exist: %s' % data_dir)
            HOME = data_dir
        # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= PRA ALGORITHM IMPLEMENTATION ============
    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data, args=None):
        print('\nThe following request in RDF format was passed:')
        print(data)
        identification, theDate, suri, puri, ouri = extract.getValues(data)
        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')
        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)
        # required for passing it to pra_predict
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])
        t1 = time()
        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')
        log.info('Computing PRA for triple')
        int_sid = int(sid)
        int_pid = int(pid)
        int_oid = int(oid)
        print("The subject id is: %s" % int_sid)
        print("The predicate id is: %s" % int_pid)
        print("The object id is: %s" % int_oid)

        # Creating a dataframe
        data = {'sid': [int_sid], 'pid': [int_pid], 'oid': [int_oid], 'class': [0]}

        # __________________test________________________
        dfObj = pd.DataFrame(data)
        test_spo_df = dfObj.dropna(axis=0, subset=['sid', 'pid', 'oid', 'class'])
        test_model_pkl = open("./output/trained_pra_model.pkl", "rb")
        test_model = pkl.load(test_model_pkl)
        test_features_pkl = open("./output/pra_features_file.pkl", "rb")
        test_features = pkl.load(test_features_pkl)
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")
                # pra_predict() is used to predict the triple's veracity
                array_value = pra_predict(self.G, test_features, test_model, test_spo_df)  # test
                val = str(array_value)[1:-1]
                log.info('PRA computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
                result = ('<http://swc2017.aksw.org/task2/dataset/s-' + str(identification) +
                          '> <http://swc2017.aksw.org/hasTruthValue>"' + str(val) +
                          '"<http://www.w3.org/2001/XMLSchema#double> .')
                print('The result in RDF format is:')
                print(result)
            except MemoryError:
                print('\nA MemoryError was caught.')
                result = 'MemoryError'
        return result
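# The @rpc decorator indicates these classes are nameko services, addressable by their
# `name` attribute. A hypothetical client call for the service above, assuming a local
# RabbitMQ broker; the AMQP URI and the request payload format are assumptions:

from nameko.standalone.rpc import ClusterRpcProxy

CONFIG = {'AMQP_URI': 'amqp://guest:guest@localhost'}  # assumed broker URI

request = ('<http://swc2017.aksw.org/task2/dataset/s-42> '
           '<http://example.org/subject> '
           '<http://dbpedia.org/resource/Gravity_(film)> .')  # hypothetical payload

with ClusterRpcProxy(CONFIG) as cluster_rpc:
    # service name comes from the class attribute `name = 'pra'`
    print(cluster_rpc.pra.stream(request))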
def main(args=None):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d', type=str, required=True, dest='dataset',
                        help='Dataset to test on.')
    parser.add_argument('-o', type=str, required=True, dest='outdir',
                        help='Path to the output directory.')
    parser.add_argument('-m', type=str, required=True, dest='method',
                        help='Method to use: stream, relklinker, klinker, \
                        predpath, sm')
    args = parser.parse_args()

    relsim = np.load(RELSIMPATH)

    outdir = abspath(expanduser(args.outdir))
    assert exists(outdir)
    args.outdir = outdir
    datafile = abspath(expanduser(args.dataset))
    assert exists(datafile)
    args.dataset = datafile

    # logging
    LOGPATH = join(HOME, '../logs')
    assert exists(LOGPATH)
    base = splitext(basename(args.dataset))[0]
    log_file = join('logs/', 'log_{}_{}_{}.log'.format(args.method, base, DATE))
    log.basicConfig(format='[%(asctime)s] %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S %p',
                    filename=log_file,
                    level=log.DEBUG)
    log.getLogger().addHandler(log.StreamHandler())
    log.info('Launching {}..'.format(args.method))
    log.info('Dataset: {}'.format(basename(args.dataset)))
    log.info('Output dir: {}'.format(args.outdir))

    # read data
    df = pd.read_table(args.dataset, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(args.dataset)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))
    df = spo_df[['sid', 'pid', 'oid']].values
    subs, preds, objs = df[:, 0].astype(_int), df[:, 1].astype(_int), df[:, 2].astype(_int)

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    t1 = time()
    if args.method == 'stream':  # KNOWLEDGE STREAM (KS)
        # compute min. cost flow
        log.info('Computing KS for {} triples..'.format(spo_df.shape[0]))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            outjson = join(args.outdir, 'out_kstream_{}_{}.json'.format(base, DATE))
            outcsv = join(args.outdir, 'out_kstream_{}_{}.csv'.format(base, DATE))
            mincostflows, times = compute_mincostflow(G, relsim, subs, preds, objs, outjson)
            # save the results
            spo_df['score'] = mincostflows
            spo_df['time'] = times
            spo_df = normalize(spo_df)
            spo_df.to_csv(outcsv, sep=',', header=True, index=False)
            log.info('* Saved results: %s' % outcsv)
        log.info('Mincostflow computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'relklinker':  # RELATIONAL KNOWLEDGE LINKER (KL-REL)
        log.info('Computing KL-REL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_relklinker(G, relsim, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_relklinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('Relational KL computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'klinker':
        log.info('Computing KL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_klinker(G, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_klinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('KL computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
    elif args.method == 'predpath':  # PREDPATH
        vec, model = predpath_train_model(G, spo_df)  # train
        # vec, model = predpath_train_model(G, spo_df, relsim)
        print 'Time taken: {:.2f}s\n'.format(time() - t1)
        # save model
        predictor = {'dictvectorizer': vec, 'model': model}
        try:
            outpkl = join(args.outdir, 'out_predpath_{}_{}.pkl'.format(base, DATE))
            with open(outpkl, 'wb') as g:
                pkl.dump(predictor, g, protocol=pkl.HIGHEST_PROTOCOL)
            print 'Saved: {}'.format(outpkl)
        except IOError, e:
            raise e
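# A hypothetical smoke test for this entry point. Since parse_args() reads sys.argv,
# the arguments can be injected directly; the dataset and output paths below are
# placeholders (the CSV must provide sid, pid, oid columns):

import sys

sys.argv = ['kstream', '-m', 'stream',
            '-d', './datasets/sample.csv', '-o', './output']
main()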
class KnowledgeLinker(object):
    name = 'klinker'

    HOME = abspath(expanduser('./data/'))
    if not exists(HOME):
        print 'Data directory not found: %s' % HOME
        print 'Download data per instructions on:'
        print '\thttps://github.com/shiralkarprashant/knowledgestream#data'
        print 'and enter the directory path below.'
        data_dir = raw_input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
            if not os.path.isdir(data_dir):
                raise Exception('Entered path "%s" not a directory.' % data_dir)
            if not exists(data_dir):
                raise Exception('Directory does not exist: %s' % data_dir)
            HOME = data_dir
        # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= KNOWLEDGE LINKER ALGORITHM ============
    def compute_klinker(self, G, sid, pid, oid):
        """
        Parameters:
        -----------
        G: rgraph
            See `datastructures`.
        sid, pid, oid: sequence
            Sequences representing the subject, predicate and object of input triples.

        Returns:
        --------
        scores, paths, rpaths, times: sequence
            One sequence each for the proximity scores, shortest path in terms of
            nodes, shortest path in terms of relation sequence, and times taken.
        """
        # set weights
        indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN).reshape((1, G.N))
        indegsim = indegsim.ravel()
        targets = G.csr.indices % G.N
        specificity_wt = indegsim[targets]  # specificity
        G.csr.data = specificity_wt.copy()

        # back up
        data = G.csr.data.copy()
        indices = G.csr.indices.copy()
        indptr = G.csr.indptr.copy()

        # compute closure
        scores, paths, rpaths, times = [], [], [], []
        for idx, (s, p, o) in enumerate(zip(sid, pid, oid)):
            print '{}. Working on {}..'.format(idx + 1, (s, p, o)),
            ts = time()
            rp = closure(G, s, p, o, kind='metric', linkpred=True)
            tend = time()
            print 'time: {:.2f}s'.format(tend - ts)
            times.append(tend - ts)
            scores.append(rp.score)
            paths.append(rp.path)
            rpaths.append(rp.relational_path)

            # reset graph
            G.csr.data = data.copy()
            G.csr.indices = indices.copy()
            G.csr.indptr = indptr.copy()
            sys.stdout.flush()
        log.info('')
        return scores, paths, rpaths, times

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data):
        print('\nThe following request in RDF format was passed:')
        print(data)
        identification, theDate, suri, puri, ouri = extract.getValues(data)
        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')
        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)
        # required for passing it to compute_klinker
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])
        t1 = time()
        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')
        log.info('Computing KL for triple')
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")
                # compute klinker
                scores, paths, rpaths, times = self.compute_klinker(self.G, sid, pid, oid)
                normalizedScore = normalization.score(scores[0])
                log.info('KLinker computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
                result = ('<http://swc2017.aksw.org/task2/dataset/s-' + str(identification) +
                          '> <http://swc2017.aksw.org/hasTruthValue>"' + str(normalizedScore) +
                          '"<http://www.w3.org/2001/XMLSchema#double> .')
                print('The result in RDF format is:')
                print(result)
            except MemoryError:
                print('\nA MemoryError was caught.')
                result = 'MemoryError'
        return result
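# `normalization.score` is an external helper not shown in this section. A loudly
# hypothetical stand-in that maps a non-negative proximity/flow value into [0, 1);
# the real module may use a different mapping entirely:

def score(raw_score):
    """Hypothetical stand-in: squash a non-negative value into [0, 1)."""
    return float(raw_score) / (1.0 + float(raw_score))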
class KnowledgeStream(object):
    name = 'kstream'

    HOME = abspath(expanduser('./data/'))
    if not exists(HOME):
        print 'Data directory not found: %s' % HOME
        print 'Download data per instructions on:'
        print '\thttps://github.com/shiralkarprashant/knowledgestream#data'
        print 'and enter the directory path below.'
        data_dir = raw_input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
            if not os.path.isdir(data_dir):
                raise Exception('Entered path "%s" not a directory.' % data_dir)
            if not exists(data_dir):
                raise Exception('Directory does not exist: %s' % data_dir)
            HOME = data_dir
        # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= KNOWLEDGE STREAM ALGORITHM ============
    def compute_mincostflow(self, G, relsim, sid, pid, oid):
        """
        Parameters:
        -----------
        G: rgraph
            See `datastructures`.
        relsim: ndarray
            A square matrix containing relational similarity scores.
        sid, pid, oid: int
            Subject, predicate and object ids of the input triple.

        Returns:
        --------
        mincostflow: float
            Total flow for the triple.
        times: float
            Time taken to compute the stream for the triple.
        """
        # take graph backup
        G_bak = {
            'data': G.csr.data.copy(),
            'indices': G.csr.indices.copy(),
            'indptr': G.csr.indptr.copy()
        }
        cost_vec_bak = np.log(G.indeg_vec).copy()

        # some set up
        G.sources = np.repeat(np.arange(G.N), np.diff(G.csr.indptr))
        G.targets = G.csr.indices % G.N
        cost_vec = cost_vec_bak.copy()
        indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN)
        specificity_wt = indegsim[G.targets]  # specificity
        relations = (G.csr.indices - G.targets) / G.N
        s, p, o = [int(x) for x in (sid, pid, oid)]
        ts = time()
        print '{}. Working on {} ..'.format(1, (s, p, o)),
        sys.stdout.flush()

        # set weights
        relsimvec = np.array(relsim[p, :])  # specific to predicate p
        relsim_wt = relsimvec[relations]
        G.csr.data = np.multiply(relsim_wt, specificity_wt)

        # compute
        mcflow = succ_shortest_path(G, cost_vec, s, p, o, return_flow=False, npaths=5)
        mincostflow = mcflow.flow
        tend = time()
        times = tend - ts
        print 'mincostflow: {:.5f}, #paths: {}, time: {:.2f}s.'.format(
            mcflow.flow, len(mcflow.stream['paths']), tend - ts)

        # reset state of the graph
        np.copyto(G.csr.data, G_bak['data'])
        np.copyto(G.csr.indices, G_bak['indices'])
        np.copyto(G.csr.indptr, G_bak['indptr'])
        np.copyto(cost_vec, cost_vec_bak)
        return mincostflow, times

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data):
        print('\nThe following request in RDF format was passed:')
        print(data)
        identification, theDate, suri, puri, ouri = extract.getValues(data)
        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')
        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)
        # required for passing it to compute_mincostflow
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])
        t1 = time()
        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')
        log.info('Computing KS for triple')
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")
                # compute min. cost flow
                mincostflows, times = self.compute_mincostflow(self.G, self.relsim, sid, pid, oid)
                # spo_df = self.normalize(spo_df)
                normalizedScore = normalization.score(mincostflows)
                log.info('Mincostflow computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
                result = ('<http://swc2017.aksw.org/task2/dataset/s-' + str(identification) +
                          '> <http://swc2017.aksw.org/hasTruthValue>"' + str(normalizedScore) +
                          '"<http://www.w3.org/2001/XMLSchema#double> .')
                print('The result in RDF format is:')
                print(result)
            except MemoryError:
                print('\nA MemoryError was caught.')
                result = 'MemoryError'
        return result
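# The weighting step above relies on the unrolled column layout index = relation * N + target,
# which is why `targets = indices % N` and `relations = (indices - targets) / N`. A small
# self-contained check of that decomposition with toy sizes (// is used so the snippet also
# behaves correctly under Python 3; the original Python 2 code uses integer /):

import numpy as np

N = 4  # nodes
indices = np.array([1, 6, 2, 7])  # unrolled CSR column indices, index = p * N + o

targets = indices % N                 # object/target node ids
relations = (indices - targets) // N  # relation ids

assert np.array_equal(targets, [1, 2, 2, 3])
assert np.array_equal(relations, [0, 1, 0, 1])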
class DegreeProduct(object):
    name = 'degree_product'

    HOME = abspath(expanduser('./data/'))
    if not exists(HOME):
        print 'Data directory not found: %s' % HOME
        print 'Download data per instructions on:'
        print '\thttps://github.com/shiralkarprashant/knowledgestream#data'
        print 'and enter the directory path below.'
        data_dir = raw_input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
            if not os.path.isdir(data_dir):
                raise Exception('Entered path "%s" not a directory.' % data_dir)
            if not exists(data_dir):
                raise Exception('Directory does not exist: %s' % data_dir)
            HOME = data_dir
        # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= DEGREE PRODUCT ALGORITHM ============
    def compute_degree_product(self, G, subs, preds, objs):
        """
        Performs link prediction using the preferential attachment
        (degree product) measure.

        Parameters:
        -----------
        G: rgraph
            See `datastructures`.
        subs, preds, objs: sequence
            Sequences representing the subject, predicate and object of input triples.

        Returns:
        --------
        scores, times: sequence
            One sequence each for the proximity scores and times taken.
        """
        measure_map = {
            'degree_product': {
                'measure': preferential_attachment,
                'tag': 'PA'
            }
        }
        selected_measure = 'degree_product'

        # back up
        data = G.csr.data.copy()
        indices = G.csr.indices.copy()
        indptr = G.csr.indptr.copy()

        # compute closure
        measure_name = measure_map[selected_measure]['tag']
        measure = measure_map[selected_measure]['measure']
        log.info('Computing {} for {} triples..'.format(measure_name, len(subs)))
        t1 = time()
        scores, times = [], []
        for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
            print '{}. Working on {}..'.format(idx + 1, (s, p, o)),
            sys.stdout.flush()
            ts = time()
            score = measure(G, s, p, o, linkpred=True)
            tend = time()
            print 'score: {:.5f}, time: {:.2f}s'.format(score, tend - ts)
            times.append(tend - ts)
            scores.append(score)

            # reset graph
            G.csr.data = data.copy()
            G.csr.indices = indices.copy()
            G.csr.indptr = indptr.copy()
            sys.stdout.flush()
        print ''
        return scores, times

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data):
        print('\nThe following request in RDF format was passed:')
        print(data)
        identification, theDate, suri, puri, ouri = extract.getValues(data)
        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')
        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)
        # required for passing it to compute_degree_product
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])
        t1 = time()
        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')
        log.info('Computing PA for triple')
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")
                # compute degree_product
                scores, times = self.compute_degree_product(self.G, sid, pid, oid)
                normalizedScore = normalization.score(scores[0])
                log.info('DegreeProduct computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
                result = ('<http://swc2017.aksw.org/task2/dataset/s-' + str(identification) +
                          '> <http://swc2017.aksw.org/hasTruthValue>"' + str(normalizedScore) +
                          '"<http://www.w3.org/2001/XMLSchema#double> .')
                print('The result in RDF format is:')
                print(result)
            except MemoryError:
                print('\nA MemoryError was caught.')
                result = 'MemoryError'
        return result
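# The imported `preferential_attachment` measure is defined elsewhere. As a reminder of
# the underlying idea, a minimal stand-in computing the classic degree product
# deg(s) * deg(o) from the graph's degree vector; the real implementation may log-scale
# or normalize, and also receives p and linkpred=True:

import numpy as np

def degree_product_score(indeg_vec, s, o):
    """Minimal preferential-attachment sketch: score(s, o) = deg(s) * deg(o)."""
    return float(indeg_vec[s]) * float(indeg_vec[o])

# toy degree vector from test_graph1_creation: [2, 3, 3, 2]
print(degree_product_score(np.asarray([2, 3, 3, 2]), 0, 3))  # -> 4.0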
class Predpath(object):
    name = 'pra'

    HOME = abspath(expanduser('./data/'))
    if not exists(HOME):
        print 'Data directory not found: %s' % HOME
        print 'Download data per instructions on:'
        print '\thttps://github.com/shiralkarprashant/knowledgestream#data'
        print 'and enter the directory path below.'
        data_dir = raw_input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
            if not os.path.isdir(data_dir):
                raise Exception('Entered path "%s" not a directory.' % data_dir)
            if not exists(data_dir):
                raise Exception('Directory does not exist: %s' % data_dir)
            HOME = data_dir
        # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # __________________train_______________________
    # ensure input file and output directory are valid.
    outdir = abspath(expanduser('./output'))
    assert exists(outdir)

    # sample data file consisting of records used to train the model.
    datafile = abspath(expanduser('./datasets/sample_data_pra.csv'))
    assert exists(datafile)
    log.info('Dataset: {}'.format(basename(datafile)))

    # Date
    DATE = '{}'.format(date.today())

    # read data
    df = pd.read_table(datafile, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(datafile)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))

    # execute
    base = splitext(basename(datafile))[0]
    t1 = time()
    log.info('Computing pra for {} triples..'.format(spo_df.shape[0]))

    # function that trains the model
    features, model = pra_train_model(G, spo_df)  # train
    print 'Time taken: {:.2f}s\n'.format(time() - t1)

    # save model
    predictor = {'dictvectorizer': features, 'model': model}
    try:
        outpkl = join(outdir, 'trained_pra_model.pkl')
        with open(outpkl, 'wb') as g:
            pkl.dump(model, g, protocol=pkl.HIGHEST_PROTOCOL)
        outpkl_features = join(outdir, 'pra_features_file.pkl')
        with open(outpkl_features, 'wb') as g:
            pkl.dump(features, g, protocol=pkl.HIGHEST_PROTOCOL)
    except IOError, e:
        raise e
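# To close the loop, a short sketch of how the two pickles written above are consumed at
# prediction time, mirroring the loading code in Pra.stream; the triple ids are the toy
# values used in test_dbpedia, and pra_predict's signature is taken from that method:

import pickle as pkl
import pandas as pd

# load the artifacts written by the training block above
with open('./output/trained_pra_model.pkl', 'rb') as f:
    model = pkl.load(f)
with open('./output/pra_features_file.pkl', 'rb') as f:
    features = pkl.load(f)

# single-triple frame with the same columns the service builds in Pra.stream
spo_df = pd.DataFrame({'sid': [2145431], 'pid': [178], 'oid': [459128], 'class': [0]})

# prediction call as used in Pra.stream (G is the reconstructed knowledge graph):
# array_value = pra_predict(G, features, model, spo_df)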