def rdfconvert():
    """Command-line entry point: convert RDF files between serialization
    formats using the libraptor2-backed RaptorRdf converter.

    Parses CLI arguments; with --version only logs the tool version,
    otherwise runs the conversion over SOURCE.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, rdf converter, based on libraptor2'.format(str_version))
    cli.add_argument('source', metavar='SOURCE', type=str,
                     help='the source file or location (of files) to be converted')
    cli.add_argument('--clear', dest='clear', action='store_true',
                     help='clear the original files (delete) - this action is permanent, use with caution!')
    cli.add_argument('--dst_format', dest='dst_format', action='store', type=str, default='ntriples',
                     help='the destination format to convert to. Supported parsers: %s. Supported serializers %s.' % (
                         pformat(RaptorRdf.parsers), pformat(RaptorRdf.serializers)))
    cli.add_argument('--buffer_size', dest='buffer_size', action='store', type=long, default=64,
                     help='the buffer size in Mb of the input buffer (the parser will only parse XX Mb at a time)')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using version {0}'.format(str_version))
        return
    converter = RaptorRdf()
    converter(opts.source, destination_format=opts.dst_format,
              buffer_size=opts.buffer_size, clear=opts.clear)
    logger.info('done')
def genlubm():
    """Command-line entry point: run the bundled LUBM dataset generator
    (requires java).

    Parses CLI arguments; with --version only logs the tool version,
    otherwise generates the requested universities into OUTPUT.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, lubm dataset generator wrapper (bundled) - requires java'.format(str_version))
    cli.add_argument('output', metavar='OUTPUT', type=str,
                     help='the location in which to save the generated distributions')
    cli.add_argument('--univ', dest='univ', action='store', type=long, default=1,
                     help='number of universities to generate')
    cli.add_argument('--index', dest='index', action='store', type=long, default=0,
                     help='start university')
    cli.add_argument('--seed', dest='seed', action='store', type=long, default=0,
                     help='the seed')
    cli.add_argument('--ontology', dest='ontology', action='store', type=str, default=None,
                     help='the lubm ontology')
    cli.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                     help='the number of workers (default -1 : all cpus)')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using rdftools version {0}'.format(str_version))
        return
    generator = Lubm(ontology=opts.ontology, path=opts.output)
    generator(opts.univ, opts.index, opts.seed, workers=opts.workers)
    logger.info('done')
def _generate(self, **kwargs):
    """Distribute generated LUBM university files across sites, then
    concatenate the per-site fragments into a single file per site.

    Scans ``self.output_path`` for ``University<N>.nt`` files, dispatches
    each to a distributor through a multiprocessing pool, then merges all
    ``site_<site>_uni_<N>.nt`` fragments found in ``self._output_path``
    (each fragment is deleted once merged).
    """
    uni_key = 'University'
    uni_ext = '.nt'
    # Extract the numeric id from a file name like 'University12.nt'.
    get_uni_id = lambda uni_file: int(uni_file.replace(uni_key, '').replace(uni_ext, '').strip())
    # Map: university id -> absolute path of its generated .nt file.
    universities_rdf = {
        get_uni_id(f): os.path.join(self.output_path, f)
        for f in os.listdir(self.output_path)
        if f.startswith(uni_key)
    }
    pool = Pool(processes=self.num_workers)
    for uni_id, uni_rdf in universities_rdf.iteritems():
        # NOTE(review): self.distributor(uni_id, uni_rdf) is CALLED here and
        # its return value is passed to apply_async as the worker callable --
        # presumably distributor is a factory returning a callable; confirm,
        # otherwise the work runs synchronously in this process.
        pool.apply_async(self.distributor(uni_id, uni_rdf),
                         kwds=self._distributor_kwargs(uni_id, uni_rdf))
    pool.close()
    pool.join()  # wait for all distribution jobs before concatenating
    # concat files
    # All fragment files of a site match 'site_<site_id>_uni_<uni_id>.nt'.
    site_files = lambda site_id: re.findall(r'site_{0}_uni_[0-9]+\.nt'.format(site_id),
                                            ' '.join(os.listdir(self._output_path)))
    for site in xrange(self.num_sites):
        site_parts = site_files(site)
        logger.info('[site = %s] site file parts = %s', site, site_parts)
        with io.open(self.site_path(site), 'w+') as SITE:
            for spart in site_parts:
                spart_file = os.path.join(self._output_path, spart)
                with io.open(spart_file, 'r+') as SPART:
                    SITE.write(SPART.read())
                # Fragment content is merged into the site file; remove it.
                sh.rm(spart_file)
    # NOTE(review): both self.output_path and self._output_path are used in
    # this method -- verify they refer to the same location.
def _distribute_triples(self, triples, permutation='s'):
    """Assign each triple to a site chosen by hash partitioning.

    :param triples: the triples of the current university.
    :param permutation: permutation key handed to HashPartitioner
        (default 's').
    :return: a defaultdict mapping site id -> list of triples; also logs
        the per-site triple counts.
    """
    logger.info('[distributing] university %s by %s', self.uni_name, permutation)
    partitioner = HashPartitioner(self.uni_rdf, num_sites=self.num_sites,
                                  permutation=permutation)
    assignments = partitioner()
    grouped = defaultdict(list)
    counts = [0] * self.num_sites
    for idx, item in enumerate(triples):
        target = assignments[idx]
        counts[target] += 1
        grouped[target].append(item)
    logger.info('university %s total triples = %s, distribution = %s',
                self.uni_rdf, len(triples), counts)
    return grouped
def rdfencode():
    """Command-line entry point: encode the given RDF file(s) with RdfEncoder.

    Parses CLI arguments; with --version only logs the tool version,
    otherwise runs the encoder over SOURCE.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, encode the RDF file(s)'.format(str_version))
    cli.add_argument('source', metavar='SOURCE', type=str,
                     help='the source file or location (of files) to be encoded')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using version {0}'.format(str_version))
        return
    run_encoder = RdfEncoder(opts.source)
    run_encoder()
    logger.info('done')
def genvoid():
    """Command-line entry point: compute and log VoID statistics for an
    RDF file.

    Parses CLI arguments; with --version only logs the tool version,
    otherwise runs VoIDGen over SOURCE and pretty-prints the statistics.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, generate void statistics for RDF files'.format(str_version))
    cli.add_argument('source', metavar='SOURCE', type=str,
                     help='the source file to be analized')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using version {0}'.format(str_version))
        return
    run_void = VoIDGen(opts.source)
    stats = run_void()
    logger.info('Collected Statistics (VoID): \n{0}'.format(pformat(stats)))
def _distribute_triples(self, triples, uni_site_distro=None, sorted_pdist=None): if not isinstance(uni_site_distro, list) and len(uni_site_distro) > 0: raise ValueError("uni_site_distro must be a non empty List") if not isinstance(sorted_pdist, np.ndarray): raise ValueError("sorted_p_distro must be a Numpy ndarray") num_triples = len(triples) logger.info( "[distributing] university %s to sites: %s, with %s triples", self.uni_name, uni_site_distro, num_triples ) site_index = np.random.choice(uni_site_distro, num_triples, p=sorted_pdist) site_triples = defaultdict(list) for j, triple in enumerate(triples): site_triples[site_index[j]].append(triple) return site_triples
def genvoid2():
    """Command-line entry point: produce a VoID descriptor using the
    nxparser java package.

    Parses CLI arguments; with --version only logs the tool version,
    otherwise runs either the NX-builtin or the Scala VoID generator.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, generate a VoiD descriptor using the nxparser java package'.format(str_version))
    cli.add_argument('source', metavar='SOURCE', type=str,
                     help='the source file to be analized')
    cli.add_argument('--dataset_id', dest='dataset_id', action='store', type=str, default=None,
                     help='dataset id')
    cli.add_argument('--use_nx', dest='use_nx', action='store_true',
                     help='if true (default false) use the nx parser builtin void generator')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using rdftools version {0}'.format(str_version))
        return
    if opts.use_nx:
        generator = VoIDGenNX()
    else:
        generator = VoIDGenScala()
    generator(opts.source, opts.dataset_id)
    logger.info('done')
def genlubmdistro():
    """Command-line entry point: generate a LUBM dataset and distribute it
    over a number of sites according to the chosen distribution strategy.

    Parses CLI arguments; with --version only logs the tool version,
    otherwise resolves the distribution class from ``Distros``, resolves
    the probability distribution (named entry in ``DISTRIBUTIONS`` or a
    file with one float per line), and runs the distribution process.
    """
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, lubm dataset generator wrapper (bundled) - requires java'.format(str_version))
    parser.add_argument('output', metavar='OUTPUT', type=str,
                        help='the location in which to save the generated distributions')
    parser.add_argument('--distro', dest='distro', action='store', type=str, default='uni2one',
                        help='the distibution to use, valid values are %s' % Distros.keys())
    parser.add_argument('--univ', dest='univ', action='store', type=long, default=1,
                        help='number of universities to generate')
    parser.add_argument('--index', dest='index', action='store', type=long, default=0,
                        help='start university')
    parser.add_argument('--seed', dest='seed', action='store', type=long, default=0,
                        help='the seed')
    parser.add_argument('--ontology', dest='ontology', action='store', type=str, default=None,
                        help='the lubm ontology')
    parser.add_argument('--pdist', dest='pdist', action='store', type=str, default=None,
                        help='the probabilities used for the uni2many distribution, valid choices are {0} or file '
                             'with probabilities split by line'.format(DISTRIBUTIONS.keys()))
    parser.add_argument('--sites', dest='sites', action='store', type=long, default=1,
                        help='the number of sites')
    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='delete the generated universities')
    parser.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                        help='the number of workers (default -1 : all cpus)')
    parser.add_argument('--version', dest='version', action='store_true', help='the current version')
    args = parser.parse_args()
    if args.version:
        logger.info('using rdftools version {0}'.format(str_version))
    else:
        logger.info('setup distro runner')
        _DistributionClass = Distros[args.distro]
        if not issubclass(_DistributionClass, LubmGenerator):
            raise ValueError('_DistributionClass must be a LubmGenerator')
        pdist = DISTRIBUTIONS.get(args.pdist, None)
        # Compare with `is None` rather than truthiness: DISTRIBUTIONS values
        # may be numpy arrays, whose truth value is ambiguous. Only try the
        # file fallback when the user actually supplied --pdist.
        if pdist is None and args.pdist is not None:
            # BUG FIX: the original read `args.dist` (twice), which does not
            # exist -- the option is --pdist with dest='pdist', so every
            # fallback attempt crashed with AttributeError.
            try:
                with open(args.pdist, 'r+') as PDIST_FILE:
                    pdist = np.array(map(float, PDIST_FILE.readlines()))
            except Exception:
                logger.error('failed to read distribution from {0}'.format(args.pdist))
        distro = _DistributionClass(args.output, args.sites, universities=args.univ,
                                    index=args.index, clean=args.clean,
                                    workers=args.workers, pdist=pdist)
        logger.info('run distribution process')
        distro()
        logger.info('done')
def rdfconvert2():
    """Command-line entry point: convert RDF files with the bundled rdf2rdf
    tool (requires java).

    Parses CLI arguments; with --version only logs the tool version,
    otherwise runs Rdf2Rdf over SOURCE.
    """
    cli = argparse.ArgumentParser(
        description='rdftools v{0}, rdf converter (2), makes use of rdf2rdf bundled - requires java'.format(
            str_version))
    cli.add_argument('source', metavar='SOURCE', type=str,
                     help='the source file or location (of files) to be converted')
    cli.add_argument('--clear', dest='clear', action='store_true',
                     help='clear the original files (delete) - this action is permanent, use with caution!')
    cli.add_argument('--dst_format', dest='dst_format', action='store', type=str, default='ntriples',
                     help='the destination format to convert to')
    cli.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                     help='the number of workers (default -1 : all cpus)')
    cli.add_argument('--version', dest='version', action='store_true', help='the current version')
    opts = cli.parse_args()
    if opts.version:
        logger.info('using rdftools version {0}'.format(str_version))
        return
    converter = Rdf2Rdf()
    converter(opts.source, opts.dst_format, clear_source=opts.clear, workers=opts.workers)
    logger.info('done')