コード例 #1
0
    def _get_dbpedia_nl_record(self, dbpedia_nl_identifier):
        """Look up a Dutch DBPedia resource and store its record on ``self``.

        The identifier may be a plain label, an ``http:/`` URL (with plain or
        ``%2F``-encoded slashes) or a name prefixed with ``dbp:`` /
        ``dbpedia_nl:``. It is reduced to a bare resource name, used to fill
        ``self.DBPEDIA_NL_URL`` and fetched through ``self.get``.

        When the response reports at least one hit, the first document is
        resolved through ``DBPedia`` and stored on ``self`` under that
        record's first key; its ``sameAs`` entry is popped and replaced by
        the ``same`` value obtained from a ``sameAs`` lookup.

        Returns ``False`` when the identifier is empty after normalisation or
        when the lookup yields no data; otherwise returns ``None``.
        """
        if self.debug:
            self.log.info("getting.. %s " % (dbpedia_nl_identifier))

        # URLs are reduced to their last path segment.
        if "http" in dbpedia_nl_identifier.lower():
            if 'http:/' in dbpedia_nl_identifier:
                dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                    '/')[-1].strip()
            elif '%2F' in dbpedia_nl_identifier:
                dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                    '%2F')[-1].strip()

        # Drop a namespace prefix and keep the part after the first colon
        # (the resource name), not the prefix itself.
        if dbpedia_nl_identifier.lower().startswith(("dbp:", "dbpedia_nl:")):
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(':', 1)[1]

        # Nothing left to look up (e.g. a URL ending in a slash); indexing
        # the first character below would otherwise raise IndexError.
        if not dbpedia_nl_identifier:
            return False

        if dbpedia_nl_identifier[0].islower():
            dbpedia_nl_identifier = dbpedia_nl_identifier.title()
        dbpedia_nl_identifier = dbpedia_nl_identifier.replace(" ", "_")
        dbpedia_nl_identifier = dbpedia_nl_identifier.replace("%20", "_")

        name = dbpedia_nl_identifier.strip()
        if '+' in dbpedia_nl_identifier or '_' in dbpedia_nl_identifier:
            # Multi-word names also match on each individual word; the
            # '+'-separated parts come first, then the '_'-separated ones.
            parts = (dbpedia_nl_identifier.split('+')
                     + dbpedia_nl_identifier.split('_'))
            q = "".join("+OR+prefLabel:" + item for item in parts)
            url = self.DBPEDIA_NL_URL % (name, q)
        else:
            url = self.DBPEDIA_NL_URL % (name, "+OR+" + name)
        data = self.get(url)

        if not data:
            if self.debug:
                self.log.info("No DBPedia_nl data for: %s @ %s (via %s)" %
                              (dbpedia_nl_identifier, url, self.backend))
            return False

        if isinstance(data, str):
            data = simplejson.loads(data)
        if data["response"]["numFound"] > 0:
            identifier = data["response"]["docs"][0]["id"].split('/')[-1]
            record = DBPedia([identifier],
                             backend=self.backend,
                             log_path=self.log_path,
                             debug=self.debug)
            record.execute()
            # list(...) keeps the first-key lookup working whether keys()
            # returns a list or a view.
            key = list(record.keys())[0]
            self[key] = record[key]
            same_as = self[key].pop("sameAs")[0]
            rec = sameAs([same_as],
                         backend=self.backend,
                         log_path=self.log_path,
                         debug=self.debug)
            rec.execute()
            self[key]["same"] = list(rec.values())[0]["same"]
コード例 #2
0
class PropertyMatcher(object):
    """Match question tokens against the DBPedia properties of a resource."""

    # Properties that are never considered as match candidates.
    _SKIPPED_PROPERTIES = ('sameAs', 'wasDerivedFrom')

    def __init__(self):
        """Create the DBPedia client, synonym base and tokenizer."""
        self.dbpedia = DBPedia()
        self.synonyms_base = SynonymsBase()
        self.tokenizer = Tokenizer()

    def match_tokens_with_properties(self, resource, tokens):
        """Find properties of ``resource`` whose names match ``tokens``.

        Every token that survives ``_filter_tokens`` is expanded into
        synonyms; each cleaned synonym is compared with every property of
        the resource (except the skipped ones).

        Returns a ``(word_values, confidence)`` tuple. ``word_values`` maps
        a token to a list of ``(synonym, property, values)`` tuples, and
        ``confidence`` is the result of ``_calculate_confidence``.
        """
        properties = self.dbpedia.get_properties(resource)
        word_values = defaultdict(list)
        # _filter_tokens has already dropped every word that occurs in the
        # resource name, so no further check against the resource is needed.
        filtered_tokens = self._filter_tokens(tokens, resource)
        for word in filtered_tokens:
            synonyms = self.synonyms_base.find_synonyms(word, tokens)

            for synonym in self.tokenizer.clean_entities(synonyms):
                for prop in properties:
                    if prop in self._SKIPPED_PROPERTIES:
                        continue
                    if self._word_matches_property(synonym, prop):
                        values = self.dbpedia.get_value(resource, prop)
                        word_values[word].append((synonym, prop, values))
        confidence = self._calculate_confidence(filtered_tokens, word_values)
        return word_values, confidence

    def _word_matches_property(self, word, property):
        """Return True when ``word`` is a substring of the lowercased property.

        Only the property name is lowercased; ``word`` is compared as given.
        """
        return word in property.lower()  # + levenshtein?

    def _calculate_confidence(self, tokens, word_values):
        """Score how well ``word_values`` covers ``tokens``.

        Returns 0.9 for a when/where question about birth or death, 1.0 when
        there are no tokens, and otherwise the fraction of tokens that have
        at least one entry in ``word_values``.
        """
        lowered = [t.lower() for t in tokens]
        wh_q = any(t in ('when', 'where') for t in lowered)
        birth_death_q = any(
            t in ('born', 'birth', 'die', 'died') for t in lowered)
        if wh_q and birth_death_q:
            return 0.9

        if len(tokens) == 0:
            return 1.0

        # float() forces true division so the ratio is not truncated to
        # 0 or 1 on interpreters where "/" on integers is floor division.
        return float(len(word_values)) / len(tokens)

    def _filter_tokens(self, tokens, resource):
        """Return the cleaned tokens that do not occur in the resource name.

        The comparison is a case-insensitive substring test.
        """
        resource_lower = resource.lower()
        return [
            w for w in self.tokenizer.clean_entities(tokens)
            if w.lower() not in resource_lower
        ]
コード例 #3
0
ファイル: france.py プロジェクト: noirbizarre/geozones
def fetch_missing_data_from_dbpedia(db, filename):
    """Fill in DBPedia metadata for zones lacking population or area.

    Selects every zone that has a non-null ``wikipedia`` field and whose
    ``population`` or ``area`` is missing or ``None``. For each one, the
    DBPedia resource URL plus whatever ``fetch_population_or_area`` and
    ``fetch_flag_or_blazon`` return are written back with ``$set``. The
    number of zones actually updated is reported through ``success``.

    ``filename`` is not used by this function.
    """
    info('Fetching DBPedia data')
    processed = 0
    zones = db.find({
        'wikipedia': {'$exists': True, '$ne': None},
        '$or': [
            {'population': None},
            {'population': {'$exists': False}},
            {'area': None},
            {'area': {'$exists': False}},
        ]
    }, no_cursor_timeout=True)
    # A cursor opened with no_cursor_timeout=True is never reaped by the
    # server, so it must be closed explicitly, even if a fetch fails.
    try:
        for zone in zones:
            dbpedia = DBPedia(zone['wikipedia'])
            metadata = {
                'dbpedia': dbpedia.resource_url,
            }
            metadata.update(dbpedia.fetch_population_or_area())
            metadata.update(dbpedia.fetch_flag_or_blazon())
            if db.find_one_and_update({'_id': zone['_id']},
                                      {'$set': metadata}):
                processed += 1
    finally:
        zones.close()
    success('Fetched DBPedia data for {0} zones'.format(processed))
コード例 #4
0
    def _get_dbpedia_nl_record(self, dbpedia_nl_identifier):
        """Look up a Dutch DBPedia resource and store its record on ``self``.

        The identifier may be a plain label, an ``http:/`` URL (with plain or
        ``%2F``-encoded slashes) or a name prefixed with ``dbp:`` /
        ``dbpedia_nl:``. It is reduced to a bare resource name, used to fill
        ``self.DBPEDIA_NL_URL`` and fetched through ``self.get``.

        When the response reports at least one hit, the first document's id
        (with ``json`` rewritten to ``jsond``) is resolved through
        ``DBPedia`` and stored on ``self`` under that record's first key; its
        ``sameAs`` entry is popped and replaced by the ``same`` value
        obtained from a ``sameAs`` lookup.

        Returns ``False`` when the identifier is empty after normalisation or
        when the lookup yields no data; otherwise returns ``None``.
        """
        if self.debug:
            self.log.info("getting.. %s " % (dbpedia_nl_identifier))

        # URLs are reduced to their last path segment.
        if "http" in dbpedia_nl_identifier.lower():
            if 'http:/' in dbpedia_nl_identifier:
                dbpedia_nl_identifier = dbpedia_nl_identifier.split('/')[-1].strip()
            elif '%2F' in dbpedia_nl_identifier:
                dbpedia_nl_identifier = dbpedia_nl_identifier.split('%2F')[-1].strip()

        # Drop a namespace prefix and keep the part after the first colon
        # (the resource name), not the prefix itself.
        if dbpedia_nl_identifier.lower().startswith(("dbp:", "dbpedia_nl:")):
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(':', 1)[1]

        # Nothing left to look up (e.g. a URL ending in a slash); indexing
        # the first character below would otherwise raise IndexError.
        if not dbpedia_nl_identifier:
            return False

        if dbpedia_nl_identifier[0].islower():
            dbpedia_nl_identifier = dbpedia_nl_identifier.title()
        dbpedia_nl_identifier = dbpedia_nl_identifier.replace(" ", "_")
        dbpedia_nl_identifier = dbpedia_nl_identifier.replace("%20", "_")

        name = dbpedia_nl_identifier.strip()
        if '+' in dbpedia_nl_identifier or '_' in dbpedia_nl_identifier:
            # Multi-word names also match on each individual word; the
            # '+'-separated parts come first, then the '_'-separated ones.
            parts = dbpedia_nl_identifier.split('+') + dbpedia_nl_identifier.split('_')
            q = "".join("+OR+prefLabel:" + item for item in parts)
            url = self.DBPEDIA_NL_URL % (name, q)
        else:
            url = self.DBPEDIA_NL_URL % (name, "+OR+" + name)
        data = self.get(url)

        if not data:
            if self.debug:
                self.log.info("No DBPedia_nl data for: %s @ %s (via %s)" % (dbpedia_nl_identifier, url, self.backend))
            return False

        if isinstance(data, str):
            data = json.loads(data)
        if data["response"]["numFound"] > 0:
            identifier = data["response"]["docs"][0]["id"].split('/')[-1]
            record = DBPedia([identifier.replace('json', 'jsond')], backend=self.backend, log_path=self.log_path, debug=self.debug)
            record.execute()
            # list(...) keeps the first-key lookup working whether keys()
            # returns a list or a view.
            key = list(record.keys())[0]
            self[key] = record[key]
            same_as = self[key].pop("sameAs")[0]
            rec = sameAs([same_as], backend=self.backend, log_path=self.log_path, debug=self.debug)
            rec.execute()
            self[key]["same"] = list(rec.values())[0]["same"]
コード例 #5
0
 def __init__(self):
     """Create the collaborators this object works with.

     Each one is constructed with no arguments and stored on the instance.
     """
     self.dbpedia = DBPedia()
     self.synonyms_base = SynonymsBase()
     self.tokenizer = Tokenizer()