def process_item(self, item, spider):
    """Serialise one scraped item as triples and write them to ``self.file``.

    The item is read through the keys ``keyid``, ``nationid``, ``answers``,
    ``label`` and ``data``.  Each row of ``data`` is indexed positionally:
    ``row[0]`` is a ``"month/year"`` string and ``row[1:]`` are strings that
    are searched for a percentage value.  Index 0 of ``answers`` is skipped
    in the same way, so measure ``v`` lines up with column ``v`` of a row.

    For every row this emits one observation plus three slices, each slice
    leaving one dimension open (marked ``-`` in the URI where applicable):
    the survey slice fixes country and question, the question slice fixes
    country and survey, the country slice fixes question and survey.

    Args:
        item: mapping providing the keys listed above.
        spider: not used by this method.

    Returns:
        The ``item`` that was passed in, unchanged.

    Raises:
        KeyError: if ``item`` lacks one of the keys, or ``countries`` has no
            entry for ``str(nationid)``.
        ValueError: if ``row[0]`` does not split into exactly two parts on
            ``/``, if month or year is not an integer, or if the text
            captured in front of ``%`` is not a valid float.
    """
    # NOTE(review): this block was reconstructed from whitespace-collapsed
    # source; the per-row write below is inferred from the buffer being
    # reset right after the write -- confirm against the original layout.
    keyid = item['keyid']
    nationid = item['nationid']
    answers = item['answers']
    label = item['label']
    data = item['data']

    dataset_uri = "%s/dataset" % (BASE_URI)
    question_uri = "%s/question/%s" % (BASE_URI, keyid)

    # Collect fragments in a list and join once per write instead of
    # growing a string with repeated "+=".
    parts = [triple(question_uri, RDFS['label'], label)]
    for v in range(1, len(answers)):
        measure = EB['q%sa%s' % (keyid, v)]
        parts.append(triple(measure, RDF['type'], QB['MeasureProperty']))
        parts.append(triple(measure, RDFS['label'], "%s (%%)" % answers[v]))

    country = countries[str(nationid)]
    country_uri = country['uri']

    for r in data:
        (month, year) = r[0].split('/')
        country_label = country['label']

        observation_uri = "%s/observation/%s/%s/%s/%s" % (
            BASE_URI, nationid, keyid, year, month)
        survey_uri = "%s/survey/%s/%s" % (BASE_URI, year, month)

        parts.append(triple(observation_uri, RDF['type'], QB['Observation']))
        parts.append(triple(
            observation_uri, RDFS['label'],
            'All results from %s for "%s" in survey %s.%s' % (
                country_label, label, month, year)))
        parts.append(triple(observation_uri, QB['dataSet'], dataset_uri))
        parts.append(triple(observation_uri, EB['survey'], survey_uri))
        parts.append(triple(observation_uri, EB['surveyMonth'], int(month)))
        parts.append(triple(observation_uri, EB['surveyYear'], int(year)))
        parts.append(triple(observation_uri, EB['question'], question_uri))
        parts.append(triple(observation_uri, EB['country'], country_uri))

        for v in range(1, len(r)):
            # Raw string: "\s" is an invalid escape in a plain literal.
            # The greedy group captures everything up to the last "%".
            m = re.search(r"^\s*(.+)%", r[v])
            if m:
                parts.append(triple(
                    observation_uri, EB['q%sa%s' % (keyid, v)],
                    float(m.group(1))))

        survey_slice_uri = "%s/slice/%s/%s" % (BASE_URI, nationid, keyid)
        question_slice_uri = "%s/slice/%s/-/%s/%s" % (
            BASE_URI, nationid, year, month)
        country_slice_uri = "%s/slice/-/%s/%s/%s" % (
            BASE_URI, keyid, year, month)

        parts.append(triple(
            survey_slice_uri, RDFS['label'],
            'All results from %s for "%s"' % (country_label, label)))
        parts.append(triple(survey_slice_uri, QB['observation'], observation_uri))
        parts.append(triple(survey_slice_uri, EB['question'], question_uri))
        parts.append(triple(survey_slice_uri, EB['country'], country_uri))

        parts.append(triple(
            question_slice_uri, RDFS['label'],
            "All results from %s in survey %s.%s" % (
                country_label, month, year)))
        parts.append(triple(question_slice_uri, QB['observation'], observation_uri))
        parts.append(triple(question_slice_uri, EB['survey'], survey_uri))
        parts.append(triple(question_slice_uri, EB['country'], country_uri))

        parts.append(triple(
            country_slice_uri, RDFS['label'],
            'All results for "%s" in survey %s.%s' % (label, month, year)))
        parts.append(triple(country_slice_uri, QB['observation'], observation_uri))
        parts.append(triple(country_slice_uri, EB['survey'], survey_uri))
        parts.append(triple(country_slice_uri, EB['question'], question_uri))

        parts.append(triple(dataset_uri, QB['slice'], survey_slice_uri))
        parts.append(triple(dataset_uri, QB['slice'], question_slice_uri))
        parts.append(triple(dataset_uri, QB['slice'], country_slice_uri))

        # Flush after every row so rows already handled reach the file
        # even if a later row raises.
        self.file.write("".join(parts))
        parts = []

    # Only non-empty when ``data`` had no rows: the question and measure
    # triples collected before the loop would otherwise never be written.
    if parts:
        self.file.write("".join(parts))

    return item
fout.write("# Found %d distinct resources\n" % len(resources)) if opts.weights: fout.write("# Weights: %s\n" % ", ".join(opts.weights)) nummatches = opts.nummatches if nummatches > len(resources) - 1: nummatches = len(resources) - 1 std = numpy.std(distances) for r in resources: index = resources[r] try: node = opts.uripattern % hashlib.md5(r).hexdigest() fout.write(triple(r, "http://vocab.org/terms/similarThings", node)) fout.write(triple(node, "a", "http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq")) score_base = distances[index][sorted_distance_args[index][1]] for i in range(1, nummatches + 1): score = distances[index][sorted_distance_args[index][i]] if score > score_base + std: break # print "match %s has score %s (%s)" % (i, distances[index][sorted_distance_args[index][i]] , resource_index[sorted_distance_args[index][i]]) try: fout.write( triple( node, "http://www.w3.org/1999/02/22-rdf-syntax-ns#_%d" % (i), resource_index[sorted_distance_args[index][i]], ) )