コード例 #1
0
def demo_0():
    """Align the Goncourt Prize winners onto French novelists from DBpedia.

    Sends a SPARQL query to the DBpedia endpoint, reads the local
    ``prixgoncourt`` data file, and calls ``align()`` with a single
    name-based processing, writing to the ``demo0_results`` file.

    Takes no arguments and returns nothing; progress messages are printed
    to stdout.
    """
    # prixgoncourt is the list of Goncourt Prize winners, extracted
    # from Wikipedia.

    # We try to align the Goncourt winners onto the DBpedia results.

    # NOTE(review): this query declares no PREFIX for ``rdfs:`` and puts a
    # comma between the SELECT variables; presumably the DBpedia endpoint
    # tolerates both -- confirm before pointing it at another endpoint.
    query = """
       SELECT ?writer, ?name WHERE {
          ?writer  <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
          ?writer rdfs:label ?name.
          FILTER(lang(?name) = 'fr')
       }
    """

    # Single-argument ``print(...)`` behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Sending query to dbpedia")
    targetset = sparqlquery('http://dbpedia.org/sparql', query)
    print("Reading the prixgoncourt file")
    # NOTE(review): indexes=[1, 1] presumably uses column 1 both as the
    # identifier and as the name -- verify against parsefile().
    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])

    # Processing applied to the names: drop everything from the first '('
    # onwards, simplify the remainder, then compare with the Levenshtein
    # metric.
    tr_name = {'normalization': [lambda x: remove_after(x, '('),
                                 aln.simplify],
               'metric': ald.levenshtein
              }

    # NOTE(review): the key presumably selects which field of each record
    # the processing applies to -- confirm against align().
    processings = {1: tr_name}

    # Build the output path once so the path reported below is the one
    # handed to align().
    results_path = dpath('demo0_results')

    print("Alignment started")
    # NOTE(review): 0.4 is presumably the distance threshold -- confirm
    # against align().
    align(alignset, targetset, 0.4, processings, results_path)

    print("Done, see the results in %s" % results_path)
コード例 #2
0
def demo_1():
    """Align French BNF locations onto an extract of geonames.

    Loads both datasets with ``parsefile()``, then calls ``align()`` with
    two processings -- a Levenshtein distance on the names and a
    geographical distance on the positions -- writing to the
    ``demo1_results`` file.

    Takes no arguments and returns nothing; progress messages are printed
    to stdout.
    """
    # FR.txt is an extract of geonames, where locations have been sorted by
    # name.
    # frenchbnf is an extract of the French BNF's locations, sorted by name
    # too.

    # For each line (i.e. location) we keep the identifier, the name and the
    # position (longitude, latitude).
    # ``nbmax`` is the number of locations to load.

    # Single-argument ``print(...)`` behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Parsing the input files")
    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
                          nbmax=2000)
    alignset = parsefile(dpath('frenchbnf'),
                         indexes=[0, 2, (14, 12)], nbmax=1000)

    # Let's define the processings to apply on the location's name.
    tr_name = {
        # Simplify all the names (remove punctuation, lower case, etc.)
        'normalization': [aln.simplify],
        # Use the Levenshtein distance
        'metric': ald.levenshtein,
        # Use 1 as the name-distance matrix weighting coefficient
        'weighting': 1,
    }
    # ... and on the location's position.
    tr_geo = {
        # No normalization needed
        'normalization': [],
        # Use the geographical distance
        'metric': ald.geographical,
        # Arguments given to the distance function. Here, the unit to use
        'metric_params': {'units': 'km'},
        'weighting': 1,
    }

    # Field 1 of each record is the name, field 2 the position (see the
    # ``indexes`` passed to parsefile() above).
    processings = {1: tr_name, 2: tr_geo}

    # Build the output path once so the path reported below is the one
    # handed to align().
    results_path = dpath('demo1_results')

    print("Alignment started")
    align(alignset,           # The dataset to align
          targetset,          # The target dataset
          0.4,                # The maximal distance threshold
          processings,        # The processings to apply
          results_path)       # Filename of the output result file
    # According to the original author's note, ``align()`` returns two
    # items:
    # 0. the computed distance matrix
    # 1. a boolean, True if at least one alignment has been done, False
    #    otherwise
    # This demo discards that return value.
    print("Done, see the results in %s" % results_path)