class QueryCache():
    """
    Run the SPARQL queries listed in the JSON document served at GIST_URL
    and collect their results.
    """

    def __init__(self, config):
        """
        Keep the configuration and build the SPARQL wrapper and the logger.

        :param config: configuration object; it is passed to SPARQLWrap and
                       its getLogger() is used to obtain the logger
        """
        self._conf = config
        self._sparql = SPARQLWrap(config)
        self.log = config.getLogger('QueryCache')

    def go(self):
        """
        Download the JSON document at GIST_URL and run every entry of its
        'files' mapping whose name ends with '.rq' as a SELECT query.

        :return: dict mapping each '.rq' file name to the value returned by
                 run_select() for that file's 'content'
        """
        output = {}
        # NOTE(review): presumably a GitHub gist API response (name of the
        # constant, 'files'/'content' layout) -- confirm against GIST_URL.
        data = requests.get(GIST_URL).json()
        # items() instead of iteritems(): same (name, content) pairs, but it
        # exists on both Python 2 and Python 3 dictionaries.
        for (name, content) in data['files'].items():
            if not name.endswith('.rq'):
                continue
            self.log.info("Running " + name)
            output[name] = self._sparql.run_select(content['content'])
        return output
# NOTE(review): GenerateLists is defined twice in this file; at import time
# the later definition replaces the earlier one.
class GenerateLists:
    """
    This object is used to generate lists of header strings extracted from the raw data
    """

    def __init__(self, config):
        """
        Constructor

        :param config: configuration object; it is passed to SPARQLWrap and
                       its getLogger() is used to obtain the logger
        """
        self._conf = config
        self._sparql = SPARQLWrap(config)
        self._row_property_seeds = set()
        self.log = config.getLogger("GenerateLists")
        self._sparql_queries = []
        # self._sparql_queries.append(open('../Queries/find_column_headers.rq','r').read())
        # Read the query template inside a "with" block so the file handle is
        # closed as soon as the text has been loaded.
        with open("../Queries/find_row_headers.rq", "r") as template:
            self._sparql_row_property = template.read()

    def add_seed_row_property(self, string):
        """
        Add a string to the list of strings to look for in row properties

        :param string: seed text to add (adding it twice has no effect)
        :return: the result of set.add(), i.e. None
        """
        return self._row_property_seeds.add(string)

    def remove_seed_row_property(self, string):
        """
        Remove a string from the list of strings to look for in row properties

        :param string: seed text to remove
        :return: the result of set.remove(), i.e. None
        :raises KeyError: if the seed was never added
        """
        return self._row_property_seeds.remove(string)

    def go(self):
        """
        Go over all the seeds and store the output

        For each seed, the "__seed_text__" placeholder of the query template
        is replaced by the seed (verbatim, no escaping) and the query is run.

        :return: list of [dataset, value] pairs where value has its newlines
                 removed and is encoded as UTF-8. Duplicates are dropped per
                 seed; the same pair can still be reported for another seed.
        """
        output = []
        for seed in self._row_property_seeds:
            # Single-argument print(...) writes the same text on Python 2 and 3
            print("Query for %s" % seed)
            query = self._sparql_row_property.replace("__seed_text__", seed)
            results = self._sparql.run_select(query)
            distinct = set()
            print("Number of results: %d" % len(results))
            for result in results:
                dataset = result["ds"]["value"]
                value = result["value"]["value"]
                # Key on the pair itself: concatenating the two strings would
                # make ("ab", "c") and ("a", "bc") look like the same entry.
                key = (dataset, value)
                if key in distinct:
                    continue
                distinct.add(key)
                cleaned = value.replace("\n", "").encode("utf8", "replace")
                output.append([dataset, cleaned])
        return output
# NOTE(review): GenerateLists is defined twice in this file; at import time
# the later definition replaces the earlier one.
class GenerateLists():
    '''
    This object is used to generate lists of header strings extracted from the raw data
    '''

    def __init__(self, config):
        '''
        Constructor

        :param config: configuration object; it is passed to SPARQLWrap and
                       its getLogger() is used to obtain the logger
        '''
        self._conf = config
        self._sparql = SPARQLWrap(config)
        self._row_property_seeds = set()
        self.log = config.getLogger('GenerateLists')
        self._sparql_queries = []
        # self._sparql_queries.append(open('../Queries/find_column_headers.rq','r').read())
        # Load the template through a context manager so the file handle is
        # released right after reading instead of staying open.
        with open('../Queries/find_row_headers.rq', 'r') as query_file:
            self._sparql_row_property = query_file.read()

    def add_seed_row_property(self, string):
        '''
        Add a string to the list of strings to look for in row properties

        :param string: seed text to add (adding it twice has no effect)
        :return: the result of set.add(), i.e. None
        '''
        return self._row_property_seeds.add(string)

    def remove_seed_row_property(self, string):
        '''
        Remove a string from the list of strings to look for in row properties

        :param string: seed text to remove
        :return: the result of set.remove(), i.e. None
        :raises KeyError: if the seed was never added
        '''
        return self._row_property_seeds.remove(string)

    def go(self):
        '''
        Go over all the seeds and store the output

        Each seed replaces the '__seed_text__' placeholder of the query
        template (verbatim, no escaping) before the query is run.

        :return: list of [dataset, value] pairs where value has its newlines
                 removed and is encoded as UTF-8. Duplicates are dropped per
                 seed; the same pair can still be reported for another seed.
        '''
        output = []
        for seed in self._row_property_seeds:
            # Single-argument print(...) writes the same text on Python 2 and 3
            print('Query for %s' % seed)
            query = self._sparql_row_property.replace('__seed_text__', seed)
            results = self._sparql.run_select(query)
            distinct = set()
            print('Number of results: %d' % len(results))
            for result in results:
                dataset = result['ds']['value']
                value = result['value']['value']
                # A tuple key keeps ('ab', 'c') and ('a', 'bc') apart; the
                # concatenated string would treat them as one entry.
                pair = (dataset, value)
                if pair in distinct:
                    continue
                distinct.add(pair)
                cleaned = value.replace('\n', '').encode('utf8', 'replace')
                output.append([dataset, cleaned])
        return output