def demo_0():
    """Align the Goncourt Prize winners onto French novelists from DBpedia.

    The target set is fetched from the public DBpedia SPARQL endpoint
    (French labels of resources in ``Category:French_novelists``); the set to
    align is read from the ``prixgoncourt`` data file, a list of Goncourt
    Prize winners extracted from Wikipedia.  Names are compared with the
    Levenshtein distance after normalization, and the alignment is written
    to the ``demo0_results`` file.

    Takes no argument and returns nothing: progress is printed on stdout and
    the value returned by ``align()`` is discarded.
    """
    # NOTE(review): the comma between the projected variables is not part of
    # the standard SPARQL grammar; the DBpedia endpoint appears to tolerate
    # it -- confirm before pointing this query at another endpoint.
    query = """
       SELECT ?writer, ?name WHERE {
          ?writer <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
          ?writer rdfs:label ?name.
          FILTER(lang(?name) = 'fr')
       }
    """
    results_path = dpath('demo0_results')

    # ``print(...)`` with a single argument behaves identically on
    # Python 2 and Python 3.
    print("Sending query to dbpedia")
    targetset = sparqlquery('http://dbpedia.org/sparql', query)

    print("Reading the prixgoncourt file")
    # presumably column 1 serves as both identifier and name -- TODO confirm
    # against ``parsefile``.
    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])

    # Processing applied on the names: drop everything after the first '('
    # then simplify, and compare the results with the Levenshtein distance.
    tr_name = {
        'normalization': [lambda x: remove_after(x, '('), aln.simplify],
        'metric': ald.levenshtein,
    }
    processings = {1: tr_name}

    print("Alignment started")
    align(alignset, targetset, 0.4, processings, results_path)

    print("Done, see the results in %s" % results_path)
def demo_1():
    """Align French BNF locations onto geonames locations.

    ``FR.txt`` is an extract of geonames, where locations have been sorted by
    name; ``frenchbnf`` is an extract of the French BNF's locations, sorted
    by name too.  For each line (i.e. location) we keep the identifier, the
    name and the position (longitude, latitude).  Records are compared both
    on their name (Levenshtein distance) and on their position (geographical
    distance in km), and the alignment is written to the ``demo1_results``
    file.

    Takes no argument and returns nothing: progress is printed on stdout and
    the value returned by ``align()`` is discarded.
    """
    results_path = dpath('demo1_results')

    # ``print(...)`` with a single argument behaves identically on
    # Python 2 and Python 3.
    print("Parsing the input files")
    # ``nbmax`` is the number of locations to load
    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
                          nbmax=2000)
    alignset = parsefile(dpath('frenchbnf'), indexes=[0, 2, (14, 12)],
                         nbmax=1000)

    # Let's define the processings to apply on the location's name
    tr_name = {
        # Simplify all the names (remove punctuation, lower case, etc.)
        'normalization': [aln.simplify],
        # Use the Levenshtein distance
        'metric': ald.levenshtein,
        # Use 1 as the weighting coefficient of the name-distance matrix
        'weighting': 1,
    }
    # ... and the processings to apply on the location's position
    tr_geo = {
        # No normalization needed
        'normalization': [],
        # Use the geographical distance
        'metric': ald.geographical,
        # Arguments given to the distance function. Here, the unit to use
        'metric_params': {'units': 'km'},
        'weighting': 1,
    }
    processings = {1: tr_name, 2: tr_geo}

    print("Alignment started")
    # The ``align()`` function returns two items:
    #   0. the computed distance matrix
    #   1. a boolean, True if at least one alignment has been done, False
    #      otherwise
    # Both are ignored here; only the result file matters.
    align(alignset,       # The dataset to align
          targetset,      # The target dataset
          0.4,            # The maximal distance threshold
          processings,    # The processings to apply
          results_path)   # Filename of the output result file

    print("Done, see the results in %s" % results_path)