import copy
import logging
import os
import re
import shutil
import subprocess

import requests

# LodstatsDebianCache, CkanDebianConfig, RDF2RDF and RdfParser are assumed to be
# imported from the project's own modules (import paths omitted here).

logger = logging.getLogger(__name__)


class DatasetManipulator2000(object):
    def __init__(self):
        self.cache = LodstatsDebianCache()

    def getPackages(self):
        """Group cached resources into a dict keyed by their package name."""
        resources = self.cache.loadResourcesFromCache()
        datasets = dict()
        for resource in resources:
            if not hasattr(resource, 'package_name'):
                continue
            if datasets.get(resource.package_name, False):
                datasets[resource.package_name].append(resource)
            else:
                datasets[resource.package_name] = [resource]
        return datasets
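# Illustrative sketch (not part of the pipeline): getPackages() turns a flat list
# of cached resources into a dict keyed by package_name. The Resource namedtuple
# and the URLs below are hypothetical stand-ins for the cached resource objects.
def _example_group_by_package():
    from collections import namedtuple
    Resource = namedtuple('Resource', ['package_name', 'url'])
    resources = [
        Resource('dbpedia', 'http://example.org/dbpedia.nt'),
        Resource('dbpedia', 'http://example.org/dbpedia.ttl'),
        Resource('geonames', 'http://example.org/geonames.nt'),
    ]
    datasets = {}
    for resource in resources:
        datasets.setdefault(resource.package_name, []).append(resource)
    # -> {'dbpedia': [<2 resources>], 'geonames': [<1 resource>]}
    return datasets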
class LodstatsDebian(object):
    """Top-level driver: load cached resources, filter them, then build Debian packages."""

    def __init__(self):
        self.cache = LodstatsDebianCache()
        self.dm2000 = DatasetManipulator2000()
        self.datasets = self.dm2000.getPackages()
        # filters
        self.filtering = DatasetFiltering(self.datasets)
        self.datasets = self.filtering.applyFilters()
        self.datasetConfigs = self.generateConfigs(self.datasets)
        self.creator = DebianPackageCreator()
        self.creator.createDatasets(self.datasets, self.datasetConfigs)

    def generateConfigs(self, datasets):
        """Build a CkanDebianConfig for every dataset that survived filtering."""
        configs = {}
        for dataset in datasets:
            package = self.cache.getDataset(dataset)
            config = CkanDebianConfig(package, datasets[dataset])
            configs[dataset] = config
        return configs
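# Usage sketch: constructing LodstatsDebian runs the whole pipeline
# (load cache -> filter -> generate configs -> create Debian packages).
# Wrapped in a function so nothing runs on import; the logging setup is an
# assumption, this module does not configure logging itself.
def _example_run_full_pipeline():
    logging.basicConfig(level=logging.INFO)
    LodstatsDebian()  # side effects only; output lands in the 'datasets' folder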
class DebianPackageCreator(object):
    datasetsFolder = 'datasets'

    def __init__(self):
        self.cache = LodstatsDebianCache()

    def createDatasets(self, datasets, configs):
        for dataset in datasets:
            logger.info("Creating debian config for %s" % dataset)
            datasetFolder = os.path.join(self.datasetsFolder, dataset)
            if not os.path.exists(datasetFolder):
                os.makedirs(datasetFolder)
            datasetConfig = os.path.join(datasetFolder,
                                         configs[dataset].datasetId + ".cfg")
            if os.path.isfile(datasetConfig):
                logger.info("debian config already exists, skipping")
                continue

            # Download dataset RDF data
            try:
                r = requests.get(datasets[dataset].url, stream=True, timeout=1)
                datasetFilepath = os.path.join(datasetFolder,
                                               configs[dataset].datasetId)
                with open(datasetFilepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            f.flush()
            except BaseException as e:
                logger.error(str(e))
                logger.error("Could not download dataset, skipping")
                continue

            # TODO: if archive - extract all, convert all to ntriples and merge into one file
            # Convert to ntriples and append .nt extension
            rdf2rdf = RDF2RDF()
            inputFile = datasetFilepath
            inputFormat = datasets[dataset].format
            outputFile = datasetFilepath + ".nt"
            (noConversion, extension, withErrors) = rdf2rdf.convert_to_ntriples(
                inputFile, inputFormat, outputFile)
            if withErrors:
                logger.error("Dataset contains errors, skipping")
                shutil.rmtree(datasetFolder)
                continue
            if noConversion:
                # File is already in nt or n3 format
                os.rename(datasetFilepath, datasetFilepath + extension)
            else:
                # Remove old file
                os.remove(datasetFilepath)
            # Pack with bz2
            self._compressFile(outputFile)

            # Metadata processing
            datasetVoidTtl = datasets[dataset].void
            datasetMetaRdf = self.cache.getRdfMetadata(dataset)
            mergedMetadataString = self.mergeMetadata(dataset, datasetMetaRdf,
                                                      datasetVoidTtl)
            datasetMetaFilepath = os.path.join(
                datasetFolder, configs[dataset].datasetId + ".meta.ttl")
            with open(datasetMetaFilepath, 'wb') as f:
                f.write(mergedMetadataString)
                f.flush()
            # Compress with bz2
            self._compressFile(datasetMetaFilepath)

            # Write config to file
            configFilepath = os.path.join(datasetFolder,
                                          configs[dataset].datasetId + ".cfg")
            with open(configFilepath, 'wb') as f:
                f.write(configs[dataset].toString())
                f.flush()
        logger.info("package creation complete!")

    def _compressFile(self, filename):
        # Note: bzip2 is launched asynchronously and is not waited on.
        command = "bzip2 " + filename
        subprocess.Popen(command, shell=True)

    def mergeMetadata(self, dataset, dataset_meta_rdf, dataset_void_ttl):
        rdfparser = RdfParser()
        base_uri = "http://datahub.io/dataset/" + dataset
        dataset_meta_rdf_stream = rdfparser.init_stream_from_string(
            dataset_meta_rdf, base_uri, parser_name="rdfxml")
        dataset_meta_ttl_stream = rdfparser.init_stream_from_string(
            dataset_void_ttl, base_uri, parser_name="turtle")
        return rdfparser.merge_two_streams(dataset_meta_rdf_stream,
                                           dataset_meta_ttl_stream)
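# Sketch: _compressFile() launches bzip2 without waiting for it to finish, so the
# .bz2 file may not yet exist when the method returns. If a blocking variant is
# needed (for example before uploading the package), a hypothetical helper like
# this one can be used; it also avoids shell=True by passing an argument list.
def _compress_file_blocking(filename):
    subprocess.check_call(["bzip2", filename])  # raises CalledProcessError on failure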
class DatasetFiltering(object):
    def __init__(self, datasets):
        self.datasets = datasets
        self.cache = LodstatsDebianCache()

    def applyFilters(self):
        # Chain every method whose name starts with 'filter' over the datasets dict.
        for method in dir(self):
            if method.startswith('filter'):
                self.datasets = getattr(self, method)(self.datasets)
        return self.datasets

    def filterDuplicateResources(self, datasets):
        # Blank out resources that share a URL with another resource of the same dataset.
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) > 1:
                for num_fixed, res_fixed in enumerate(datasets_copy[dataset]):
                    if res_fixed == {}:
                        continue
                    for num_iter, res_iter in enumerate(datasets_copy[dataset]):
                        if res_iter == {}:
                            continue
                        if num_fixed != num_iter and res_fixed.url == res_iter.url:
                            datasets_copy[dataset][num_iter] = {}
        # Wipe empty resources
        datasets_wiped = self._purgeResources(datasets_copy)
        datasets_purged = self._purgeDatasets(datasets_wiped)
        return datasets_purged

    def filterMoreThanOneDump(self, datasets):
        # Drop datasets that offer more than one dump in the same format.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) > 1:
                for num_fixed, res_fixed in enumerate(datasets_copy[dataset]):
                    if res_fixed != {}:
                        for num_iter, res_iter in enumerate(datasets_copy[dataset]):
                            if res_iter != {}:
                                if (num_fixed != num_iter and
                                        res_fixed.format == res_iter.format):
                                    if dataset not in list_to_delete:
                                        list_to_delete.append(dataset)
        for dataset in list_to_delete:
            del datasets_copy[dataset]
        return datasets_copy

    def filterMoreThanMillionTriples(self, datasets):
        # Drop datasets with any resource larger than one million triples.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                if resource != {} and resource.triples > 1000000:
                    if dataset not in list_to_delete:
                        list_to_delete.append(dataset)
        for dataset in list_to_delete:
            del datasets_copy[dataset]
        return datasets_copy

    def filterDatasetsNoFreeLicenses(self, datasets):
        # Drop datasets whose CKAN package is not under an open license.
        datasets_copy = copy.deepcopy(datasets)
        list_to_delete = []
        for dataset in datasets_copy:
            # Try to load from cache
            package = self.cache.getDataset(dataset)
            if not package.isopen:
                list_to_delete.append(dataset)
        for dataset in list_to_delete:
            del datasets_copy[dataset]
        return datasets_copy

    def filterMetaExamplesApis(self, datasets):
        """Filter out api/example/meta/owl/html resources, archives and a few known-bad URLs (for now)."""
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            for num, resource in enumerate(datasets_copy[dataset]):
                if (re.match(r'api', resource.format, re.M | re.I) or
                        re.match(r'example', resource.format, re.M | re.I) or
                        re.match(r'meta', resource.format, re.M | re.I) or
                        re.match(r'owl', resource.format, re.M | re.I) or
                        re.match(r'ravensburg-local-shopping-graph', dataset, re.M | re.I) or
                        re.match(r'html', resource.format, re.M | re.I) or
                        re.match(r'.*\.gz', resource.url, re.M | re.I) or
                        re.match(r'.*\.tgz', resource.url, re.M | re.I) or
                        re.match(r'.*\.zip', resource.url, re.M | re.I) or
                        re.match(r'http://lov.okfn.org/dataset/lov', resource.url, re.M | re.I) or
                        re.match(r'http://www.ontosearch.com/', resource.url, re.M | re.I)):
                    datasets_copy[dataset][num] = {}
        datasets_wiped = self._purgeResources(datasets_copy)
        return datasets_wiped

    def filterOneResource(self, datasets):
        # Prefer an ntriples resource if present, then keep a single resource per dataset.
        datasets_copy = copy.deepcopy(datasets)
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                if re.match(r'ntriple', resource.format):
                    datasets_copy[dataset] = [resource]
                    break
        for dataset in datasets_copy:
            for resource in datasets_copy[dataset]:
                datasets_copy[dataset] = resource
                break
        return datasets_copy

    def _purgeResources(self, datasets):
        # Rebuild the dict keeping only the non-empty resources.
        datasets_wiped = {}
        for dataset in datasets:
            for resource in self._ifilter(lambda x: x == {}, datasets[dataset]):
                if datasets_wiped.get(resource.package_name, False):
                    datasets_wiped[resource.package_name].append(resource)
                else:
                    datasets_wiped[resource.package_name] = [resource]
        return datasets_wiped

    def _purgeDatasets(self, datasets):
        # Drop datasets that have no resources left.
        datasets_copy = copy.deepcopy(datasets)
        to_purge = []
        for dataset in datasets_copy:
            if len(datasets_copy[dataset]) == 0:
                to_purge.append(dataset)
        for dataset in to_purge:
            del datasets_copy[dataset]
        return datasets_copy

    def _ifilter(self, predicate, iterable):
        # Inverse filter: yield the items for which the predicate is False.
        if predicate is None:
            predicate = bool
        for x in iterable:
            if not predicate(x):
                yield x
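# Illustrative sketch: applyFilters() discovers every method whose name starts
# with 'filter' and chains them over the data; dir() returns names sorted, so
# the filters run in alphabetical order. _MiniFiltering below is a hypothetical
# stand-in used only to demonstrate that dispatch pattern on a list of ints.
class _MiniFiltering(object):
    def applyFilters(self, data):
        for name in dir(self):
            if name.startswith('filter'):
                data = getattr(self, name)(data)
        return data

    def filterEven(self, data):
        return [x for x in data if x % 2]

    def filterLarge(self, data):
        return [x for x in data if x < 100]

# _MiniFiltering().applyFilters([1, 2, 3, 200]) -> [1, 3]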