Example #1
class DatasetManipulator2000(object):
    def __init__(self):
        self.cache = LodstatsDebianCache()

    def getPackages(self):
        # Group the cached resources into a dict keyed by package name.
        resources = self.cache.loadResourcesFromCache()
        datasets = dict()
        for resource in resources:
            if not hasattr(resource, 'package_name'):
                continue
            if resource.package_name in datasets:
                datasets[resource.package_name].append(resource)
            else:
                datasets[resource.package_name] = [resource]
        return datasets
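
The grouping in getPackages() can also be written with collections.defaultdict. A minimal sketch; group_by_package is a hypothetical free function, not part of the project:

import collections

def group_by_package(resources):
    # Group resources under their package_name, skipping resources
    # that have none; behaves like getPackages() above.
    datasets = collections.defaultdict(list)
    for resource in resources:
        if hasattr(resource, 'package_name'):
            datasets[resource.package_name].append(resource)
    return dict(datasets)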
Example #2
class LodstatsDebian(object):
    def __init__(self):
        self.cache = LodstatsDebianCache()
        self.dm2000 = DatasetManipulator2000()
        self.datasets = self.dm2000.getPackages()
        # apply the dataset filters before packaging
        self.filtering = DatasetFiltering(self.datasets)
        self.datasets = self.filtering.applyFilters()

        self.datasetConfigs = self.generateConfigs(self.datasets)
        self.creator = DebianPackageCreator()
        self.creator.createDatasets(self.datasets, self.datasetConfigs)

    def generateConfigs(self, datasets):
        # Build a CkanDebianConfig for each dataset, keyed by dataset name.
        configs = {}
        for dataset in datasets:
            package = self.cache.getDataset(dataset)
            config = CkanDebianConfig(package, datasets[dataset])
            configs[dataset] = config
        return configs
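
Note that LodstatsDebian runs the whole pipeline as a side effect of construction. A hedged usage sketch; the driver and the printed summary are illustrative, not project code:

# Hypothetical driver script; LodstatsDebian is assumed importable
# from the surrounding project.
if __name__ == '__main__':
    pipeline = LodstatsDebian()  # loads, filters, configures and packages
    print("%d datasets processed" % len(pipeline.datasets))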
Example #3
import logging
import os
import shutil
import subprocess

import requests

logger = logging.getLogger(__name__)


class DebianPackageCreator(object):
    datasetsFolder = 'datasets'

    def __init__(self):
        # LodstatsDebianCache, RDF2RDF, RdfParser and CkanDebianConfig are
        # project-local helpers, assumed importable from the same package.
        self.cache = LodstatsDebianCache()

    def createDatasets(self, datasets, configs):
        for dataset in datasets:
            logger.info("Creating debian config for %s" % dataset)
            datasetFolder = os.path.join(self.datasetsFolder, dataset)
            if not os.path.exists(datasetFolder):
                os.makedirs(datasetFolder)

            datasetConfig = os.path.join(datasetFolder,
                                         configs[dataset].datasetId + ".cfg")
            if os.path.isfile(datasetConfig):
                logger.info("debian config already exists, skipping")
                continue

            ### Download the dataset's RDF data
            try:
                r = requests.get(datasets[dataset].url, stream=True, timeout=1)
                datasetFilepath = os.path.join(datasetFolder,
                                               configs[dataset].datasetId)
                with open(datasetFilepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
                            f.flush()
            except Exception as e:
                logger.error(str(e))
                logger.error("Could not download dataset, skipping")
                continue
            # If it is an archive: extract all files, convert them to
            # ntriples and merge into one file.
            # Convert to ntriples and append the .nt extension.
            rdf2rdf = RDF2RDF()
            inputFile = datasetFilepath
            inputFormat = datasets[dataset].format
            outputFile = datasetFilepath + ".nt"
            (noConversion, extension,
             withErrors) = rdf2rdf.convert_to_ntriples(inputFile, inputFormat,
                                                       outputFile)

            if withErrors:
                logger.error("Dataset contains errors, skipping")
                shutil.rmtree(datasetFolder)
                continue

            if noConversion:
                # File is already in nt or n3 format
                os.rename(datasetFilepath, datasetFilepath + extension)
            else:
                # remove the old, unconverted file
                os.remove(datasetFilepath)
            # pack with bz2
            self._compressFile(outputFile)

            ### Metadata processing
            datasetVoidTtl = datasets[dataset].void
            datasetMetaRdf = self.cache.getRdfMetadata(dataset)
            mergedMetadataString = self.mergeMetadata(dataset, datasetMetaRdf,
                                                      datasetVoidTtl)
            datasetMetaFilepath = os.path.join(
                datasetFolder, configs[dataset].datasetId + ".meta.ttl")
            with open(datasetMetaFilepath, 'wb') as f:
                f.write(mergedMetadataString)
                f.flush()
            # compress with bz2
            self._compressFile(datasetMetaFilepath)

            ### Write the config to file
            configFilepath = os.path.join(datasetFolder,
                                          configs[dataset].datasetId + ".cfg")
            with open(configFilepath, 'wb') as f:
                f.write(configs[dataset].toString())
                f.flush()
        logger.info("package creation complete!")

    def _compressFile(self, filename):
        # Compress in place with bzip2; pass an argument list (no shell) and
        # block until it finishes so later steps see the completed .bz2 file.
        subprocess.check_call(["bzip2", filename])

    def mergeMetadata(self, dataset, dataset_meta_rdf, dataset_void_ttl):
        rdfparser = RdfParser()
        base_uri = "http://datahub.io/dataset/" + dataset
        dataset_meta_rdf_stream = rdfparser.init_stream_from_string(
            dataset_meta_rdf, base_uri, parser_name="rdfxml")
        dataset_meta_ttl_stream = rdfparser.init_stream_from_string(
            dataset_void_ttl, base_uri, parser_name="turtle")
        return rdfparser.merge_two_streams(dataset_meta_rdf_stream,
                                           dataset_meta_ttl_stream)
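
The download step in createDatasets() follows the standard streamed-download pattern. A self-contained sketch of the same pattern with a more forgiving timeout; the function name and defaults are illustrative, not project API:

import requests

def download_to_file(url, filepath, chunk_size=1024, timeout=10):
    # Stream the response body to disk instead of holding it in memory.
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors as exceptions
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return filepath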
Example #4
import copy
import re


class DatasetFiltering(object):

    def __init__(self, datasets):
        self.datasets = datasets
        self.cache = LodstatsDebianCache()

    def applyFilters(self):
        # Apply every method named filter* in dir() (alphabetical) order;
        # filterOneResource therefore runs last and collapses each dataset
        # to a single resource.
        for method in dir(self):
            if method.startswith('filter'):
                self.datasets = getattr(self, method)(self.datasets)
        return self.datasets

    def filterDuplicateResources(self, datasets):
        # Blank out resources that share a URL with an earlier resource,
        # keeping only the first occurrence.
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) > 1:
                for num_fixed, res_fixed in enumerate(datasets_copy[dataset]):
                    if res_fixed == {}:
                        continue
                    for num_iter, res_iter in enumerate(datasets_copy[dataset]):
                        if res_iter == {}:
                            continue
                        if num_fixed != num_iter and res_fixed.url == res_iter.url:
                            datasets_copy[dataset][num_iter] = {}

        # wipe empty resources, then drop datasets that became empty
        datasets_wiped = self._purgeResources(datasets_copy)
        datasets_purged = self._purgeDatasets(datasets_wiped)

        return datasets_purged

    def filterMoreThanOneDump(self, datasets):
        # Drop datasets that publish more than one dump in the same format.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) > 1:
                for num_fixed, res_fixed in enumerate(datasets_copy[dataset]):
                    if res_fixed == {}:
                        continue
                    for num_iter, res_iter in enumerate(datasets_copy[dataset]):
                        if res_iter == {}:
                            continue
                        if (num_fixed != num_iter and
                                res_fixed.format == res_iter.format):
                            if dataset not in list_to_delete:
                                list_to_delete.append(dataset)

        for dataset in list_to_delete:
            del datasets_copy[dataset]

        return datasets_copy

    def filterMoreThanMillionTriples(self, datasets):
        # Drop datasets where any resource holds more than a million triples.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                if resource != {} and resource.triples > 1000000:
                    if dataset not in list_to_delete:
                        list_to_delete.append(dataset)

        for dataset in list_to_delete:
            del datasets_copy[dataset]

        return datasets_copy

    def filterDatasetsNoFreeLicenses(self, datasets):
        # Keep only datasets whose CKAN package carries an open license.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []

        for dataset in datasets_copy:
            # Try to load the package metadata from cache
            package = self.cache.getDataset(dataset)
            if not package.isopen:
                list_to_delete.append(dataset)

        for dataset in list_to_delete:
            del datasets_copy[dataset]

        return datasets_copy

    def filterMetaExamplesApis(self, datasets):
        """
            Filter out API endpoints, examples, metadata, OWL ontologies,
            HTML pages, archives and a few known-bad resources.
        """
        datasets_copy = copy.deepcopy(datasets)

        for dataset in datasets_copy:
            for num, resource in enumerate(datasets_copy[dataset]):
                if (re.match(r'api', resource.format, re.M | re.I) or
                   re.match(r'example', resource.format, re.M | re.I) or
                   re.match(r'meta', resource.format, re.M | re.I) or
                   re.match(r'owl', resource.format, re.M | re.I) or
                   re.match(r'ravensburg-local-shopping-graph', dataset, re.M | re.I) or
                   re.match(r'html', resource.format, re.M | re.I) or
                   re.match(r'.*\.gz', resource.url, re.M | re.I) or
                   re.match(r'.*\.tgz', resource.url, re.M | re.I) or
                   re.match(r'.*\.zip', resource.url, re.M | re.I) or
                   re.match(r'http://lov.okfn.org/dataset/lov', resource.url, re.M | re.I) or
                   re.match(r'http://www.ontosearch.com/', resource.url, re.M | re.I)):
                    datasets_copy[dataset][num] = {}

        datasets_wiped = self._purgeResources(datasets_copy)
        return datasets_wiped

    def filterOneResource(self, datasets):
        # Prefer the ntriples resource when a dataset offers one.
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                if re.match(r'ntriple', resource.format):
                    datasets_copy[dataset] = [resource]
                    break

        # Collapse each resource list to its first entry; from here on the
        # dict maps dataset name -> a single resource object, not a list.
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                datasets_copy[dataset] = resource
                break

        return datasets_copy

    def _purgeResources(self, datasets):
        # Rebuild the dict keeping only non-empty resources; _ifilter yields
        # the items for which the predicate is false.
        datasets_wiped = {}
        for dataset in datasets:
            for resource in self._ifilter(lambda x: x == {}, datasets[dataset]):
                if resource.package_name in datasets_wiped:
                    datasets_wiped[resource.package_name].append(resource)
                else:
                    datasets_wiped[resource.package_name] = [resource]
        return datasets_wiped

    def _purgeDatasets(self, datasets):
        # Drop datasets whose resource list became empty after filtering.
        datasets_copy = copy.deepcopy(datasets)
        to_purge = []
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) == 0:
                to_purge.append(dataset)

        for dataset in to_purge:
            del datasets_copy[dataset]

        return datasets_copy

    def _ifilter(self, predicate, iterable):
        # Inverse filter: yield the items for which predicate(x) is false
        # (the counterpart of itertools' filterfalse).
        if predicate is None:
            predicate = bool
        for x in iterable:
            if not predicate(x):
                yield x
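
Because applyFilters() picks up every method whose name starts with 'filter', adding a rule is just adding a method. A hypothetical subclass sketch; filterEmptyUrls is invented for illustration and is not part of the project:

import copy

class MyDatasetFiltering(DatasetFiltering):
    def filterEmptyUrls(self, datasets):
        # Hypothetical rule: drop resources whose url attribute is empty.
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            datasets_copy[dataset] = [r for r in datasets_copy[dataset]
                                      if r != {} and r.url]
        # reuse the base-class helper to drop datasets left empty
        return self._purgeDatasets(datasets_copy)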