Exemple #1
0
def rdfconvert():
    """CLI entry point: convert RDF file(s) between serialization formats.

    Parses command-line arguments; either reports the tool version or runs a
    :class:`RaptorRdf` conversion over SOURCE.
    """
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, rdf converter, based on libraptor2'.format(str_version))

    parser.add_argument('source', metavar='SOURCE', type=str,
                        help='the source file or location (of files) to be converted')
    parser.add_argument('--clear', dest='clear', action='store_true',
                        help='clear the original files (delete) - this action is permanent, use with caution!')
    parser.add_argument('--dst_format', dest='dst_format', action='store', type=str, default='ntriples',
                        help='the destination format to convert to. Supported parsers: %s. Supported serializers %s.' % (
                            pformat(RaptorRdf.parsers), pformat(RaptorRdf.serializers)))
    # `int` instead of the Python 2-only `long`: Py2 ints auto-promote to
    # arbitrary precision, and `long` does not exist on Python 3.
    parser.add_argument('--buffer_size', dest='buffer_size', action='store', type=int, default=64,
                        help='the buffer size in Mb of the input buffer (the parser will only parse XX Mb at a time)')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    if args.version:
        logger.info('using version {0}'.format(str_version))
    else:
        rdf_converter = RaptorRdf()
        rdf_converter(args.source, destination_format=args.dst_format, buffer_size=args.buffer_size,
                      clear=args.clear)
        logger.info('done')
Exemple #2
0
def genlubm():
    """CLI entry point: generate LUBM universities with the bundled generator.

    Parses command-line arguments; either reports the tool version or runs
    the :class:`Lubm` generator (requires java).
    """
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, lubm dataset generator wrapper (bundled) - requires java'.format(str_version))

    parser.add_argument('output', metavar='OUTPUT', type=str,
                        help='the location in which to save the generated distributions')
    # `int` instead of the Python 2-only `long` (removed in Python 3;
    # behaviorally identical in Python 2).
    parser.add_argument('--univ', dest='univ', action='store', type=int, default=1,
                        help='number of universities to generate')
    parser.add_argument('--index', dest='index', action='store', type=int, default=0,
                        help='start university')
    parser.add_argument('--seed', dest='seed', action='store', type=int, default=0,
                        help='the seed')
    parser.add_argument('--ontology', dest='ontology', action='store', type=str, default=None,
                        help='the lubm ontology')
    parser.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                        help='the number of workers (default -1 : all cpus)')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    if args.version:
        logger.info('using rdftools version {0}'.format(str_version))
    else:
        lubm_generator = Lubm(ontology=args.ontology, path=args.output)
        lubm_generator(args.univ, args.index, args.seed, workers=args.workers)
        logger.info('done')
Exemple #3
0
    def _generate(self, **kwargs):
        """Distribute each generated university file across sites, then merge.

        Fans the per-university "University<N>.nt" files out to a worker
        pool, then concatenates the per-site part files into one file per
        site, deleting the parts afterwards.
        """
        uni_key = 'University'
        uni_ext = '.nt'
        # "University<N>.nt" -> N
        get_uni_id = lambda uni_file: int(uni_file.replace(uni_key, '').replace(uni_ext, '').strip())

        # Map university id -> full path of its generated file.
        universities_rdf = {
            get_uni_id(f): os.path.join(self.output_path, f)
            for f in os.listdir(self.output_path)
            if f.startswith(uni_key)
        }

        # NOTE(review): self.distributor(uni_id, uni_rdf) is invoked
        # immediately and its RESULT is what apply_async schedules -
        # presumably distributor is a factory returning a callable; confirm,
        # otherwise the work actually runs serially in this process.
        pool = Pool(processes=self.num_workers)
        for uni_id, uni_rdf in universities_rdf.iteritems():
            pool.apply_async(self.distributor(uni_id, uni_rdf),
                             kwds=self._distributor_kwargs(uni_id, uni_rdf))
        pool.close()
        pool.join()

        # concat files
        # Collect the "site_<id>_uni_<n>.nt" part files produced above.
        # NOTE(review): this block uses self._output_path while the listing
        # above uses self.output_path - confirm both refer to the same
        # directory (likely a property wrapping the attribute).
        site_files = lambda site_id: re.findall(r'site_{0}_uni_[0-9]+\.nt'.format(site_id),
                                                ' '.join(os.listdir(self._output_path)))
        for site in xrange(self.num_sites):
            site_parts = site_files(site)
            logger.info('[site = %s] site file parts = %s', site, site_parts)

            # Append every part into the site file; each part is removed as
            # soon as it has been copied.
            with io.open(self.site_path(site), 'w+') as SITE:
                for spart in site_parts:
                    spart_file = os.path.join(self._output_path, spart)
                    with io.open(spart_file, 'r+') as SPART:
                        SITE.write(SPART.read())
                    sh.rm(spart_file)
    def _distribute_triples(self, triples, permutation='s'):
        """Partition *triples* across sites with a hash-based partitioner.

        :param triples: sequence of triples belonging to one university
        :param permutation: triple-component key passed to HashPartitioner
            ('s' presumably selects the subject - confirm against the class)
        :return: defaultdict mapping site index -> list of triples
        """
        logger.info('[distributing] university %s by %s', self.uni_name, permutation)
        # site_index[i] is the destination site of triples[i].
        site_index = HashPartitioner(self.uni_rdf, num_sites=self.num_sites, permutation=permutation)()

        site_triples = defaultdict(list)

        # Per-site triple counts, kept only for the summary log line below.
        # `[0] * n` replaces the Python 2-only `xrange` list-build idiom.
        sites = [0] * self.num_sites
        for i, triple in enumerate(triples):
            site = site_index[i]  # hoist the repeated lookup
            sites[site] += 1
            site_triples[site].append(triple)
        logger.info('university %s total triples = %s, distribution = %s', self.uni_rdf, len(triples), sites)

        return site_triples
Exemple #5
0
def rdfencode():
    """CLI entry point: encode the RDF file(s) given as SOURCE."""
    parser = argparse.ArgumentParser(description='rdftools v{0}, encode the RDF file(s)'.format(str_version))

    parser.add_argument('source', metavar='SOURCE', type=str,
                        help='the source file or location (of files) to be encoded')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    # --version short-circuits; otherwise run the encoder over SOURCE.
    if args.version:
        logger.info('using version {0}'.format(str_version))
        return

    RdfEncoder(args.source)()
    logger.info('done')
Exemple #6
0
def genvoid():
    """CLI entry point: compute VoID statistics for an RDF file."""
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, generate void statistics for RDF files'.format(str_version))

    parser.add_argument('source', metavar='SOURCE', type=str,
                        help='the source file to be analized')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    # --version short-circuits; otherwise run the generator and log the stats.
    if args.version:
        logger.info('using version {0}'.format(str_version))
        return

    stats = VoIDGen(args.source)()
    logger.info('Collected Statistics (VoID): \n{0}'.format(pformat(stats)))
Exemple #7
0
    def _distribute_triples(self, triples, uni_site_distro=None, sorted_pdist=None):
        """Randomly assign *triples* to sites following a probability distro.

        :param triples: sequence of triples for one university
        :param uni_site_distro: non-empty list of candidate site ids
        :param sorted_pdist: numpy ndarray of probabilities aligned with
            uni_site_distro (np.random.choice requires them to sum to 1)
        :return: defaultdict mapping site id -> list of triples assigned to it
        :raises ValueError: if uni_site_distro is not a non-empty list, or
            sorted_pdist is not an ndarray
        """
        # Bug fix: the original test `not isinstance(x, list) and len(x) > 0`
        # let an empty list through (contradicting the error message) and
        # crashed on unsized non-list inputs. Reject anything that is not a
        # non-empty list.
        if not isinstance(uni_site_distro, list) or len(uni_site_distro) == 0:
            raise ValueError("uni_site_distro must be a non empty List")
        if not isinstance(sorted_pdist, np.ndarray):
            # message fixed to reference the actual parameter name
            raise ValueError("sorted_pdist must be a Numpy ndarray")

        num_triples = len(triples)
        logger.info(
            "[distributing] university %s to sites: %s, with %s triples", self.uni_name, uni_site_distro, num_triples
        )
        # One independent draw per triple: site_index[j] is triples[j]'s site.
        site_index = np.random.choice(uni_site_distro, num_triples, p=sorted_pdist)

        site_triples = defaultdict(list)
        for j, triple in enumerate(triples):
            site_triples[site_index[j]].append(triple)

        return site_triples
Exemple #8
0
def genvoid2():
    """CLI entry point: build a VoID descriptor via the nxparser java package."""
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, generate a VoiD descriptor using the nxparser java package'.format(str_version))

    parser.add_argument('source', metavar='SOURCE', type=str,
                        help='the source file to be analized')
    parser.add_argument('--dataset_id', dest='dataset_id', action='store', type=str, default=None,
                        help='dataset id')
    parser.add_argument('--use_nx', dest='use_nx', action='store_true',
                        help='if true (default false) use the nx parser builtin void generator')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    # --version short-circuits; otherwise pick the generator implementation
    # (builtin nx vs scala) and run it.
    if args.version:
        logger.info('using rdftools version {0}'.format(str_version))
        return

    if args.use_nx:
        void_generator = VoIDGenNX()
    else:
        void_generator = VoIDGenScala()
    void_generator(args.source, args.dataset_id)
    logger.info('done')
Exemple #9
0
def genlubmdistro():
    """CLI entry point: generate LUBM universities and distribute them to sites.

    Parses command-line arguments, resolves the distribution class and the
    optional probability distribution (named or loaded from a file), then
    runs the distribution process.
    """
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, lubm dataset generator wrapper (bundled) - requires java'.format(str_version))

    parser.add_argument('output', metavar='OUTPUT', type=str,
                        help='the location in which to save the generated distributions')
    parser.add_argument('--distro', dest='distro', action='store', type=str, default='uni2one',
                        help='the distibution to use, valid values are %s' % Distros.keys())
    # `int` instead of the Python 2-only `long` (removed in Python 3;
    # behaviorally identical in Python 2).
    parser.add_argument('--univ', dest='univ', action='store', type=int, default=1,
                        help='number of universities to generate')
    parser.add_argument('--index', dest='index', action='store', type=int, default=0,
                        help='start university')
    parser.add_argument('--seed', dest='seed', action='store', type=int, default=0,
                        help='the seed')
    parser.add_argument('--ontology', dest='ontology', action='store', type=str, default=None,
                        help='the lubm ontology')
    parser.add_argument('--pdist', dest='pdist', action='store', type=str, default=None,
                        help='the probabilities used for the uni2many distribution, valid choices are {0} or file '
                             'with probabilities split by line'.format(DISTRIBUTIONS.keys()))
    parser.add_argument('--sites', dest='sites', action='store', type=int, default=1,
                        help='the number of sites')
    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='delete the generated universities')
    parser.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                        help='the number of workers (default -1 : all cpus)')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    if args.version:
        logger.info('using rdftools version {0}'.format(str_version))
    else:
        logger.info('setup distro runner')
        _DistributionClass = Distros[args.distro]
        if not issubclass(_DistributionClass, LubmGenerator):
            raise ValueError('_DistributionClass must be a LubmGenerator')
        pdist = DISTRIBUTIONS.get(args.pdist, None)
        # Bug fixes vs. original:
        # - `args.dist` did not exist (the dest is `pdist`), so the fallback
        #   path raised AttributeError - and again inside the except handler.
        # - `if not pdist:` raises ValueError when pdist is a multi-element
        #   numpy array; test for None instead.
        # - only attempt the file fallback when --pdist was actually given.
        if pdist is None and args.pdist is not None:
            try:
                # Not a named distribution: treat --pdist as a file with one
                # probability per line.
                with open(args.pdist, 'r+') as PDIST_FILE:
                    # list comprehension instead of np.array(map(...)), which
                    # produces a 0-d object array on Python 3
                    pdist = np.array([float(line) for line in PDIST_FILE.readlines()])
            except Exception:
                # best-effort: log and fall through with pdist = None
                logger.error('failed to read distribution from {0}'.format(args.pdist))
        distro = _DistributionClass(args.output, args.sites, universities=args.univ, index=args.index, clean=args.clean,
                                    workers=args.workers, pdist=pdist)
        logger.info('run distribution process')
        distro()
        logger.info('done')
Exemple #10
0
def rdfconvert2():
    """CLI entry point: convert RDF file(s) with the bundled rdf2rdf jar."""
    parser = argparse.ArgumentParser(
        description='rdftools v{0}, rdf converter (2), makes use of rdf2rdf bundled - requires java'.format(
            str_version))

    parser.add_argument('source', metavar='SOURCE', type=str,
                        help='the source file or location (of files) to be converted')
    parser.add_argument('--clear', dest='clear', action='store_true',
                        help='clear the original files (delete) - this action is permanent, use with caution!')
    parser.add_argument('--dst_format', dest='dst_format', action='store', type=str, default='ntriples',
                        help='the destination format to convert to')
    parser.add_argument('--workers', dest='workers', action='store', type=int, default=-1,
                        help='the number of workers (default -1 : all cpus)')
    parser.add_argument('--version', dest='version', action='store_true',
                        help='the current version')

    args = parser.parse_args()

    # --version short-circuits; otherwise run the java-backed converter.
    if args.version:
        logger.info('using rdftools version {0}'.format(str_version))
        return

    Rdf2Rdf()(args.source, args.dst_format, clear_source=args.clear, workers=args.workers)
    logger.info('done')