def dataset(name):
    """Return the cached ``Dataset`` for *name*, building it on first use.

    On a cache miss the new dataset is stored in ``DATASETS``.  When the
    ``NOMENKLATURA_PRELOAD`` setting is truthy, ``ENTITIES`` is additionally
    filled with one entry per entity and one per invalid or matched alias,
    each keyed by ``(name, lowercased entity/alias name)``.
    """
    if name in DATASETS:
        return DATASETS[name]

    source = Dataset(name, api_key=app.config['NOMENKLATURA_API_KEY'])
    DATASETS[name] = source
    if not app.config['NOMENKLATURA_PRELOAD']:
        return source

    for entity in source.entities():
        ENTITIES[(name, entity.name.lower())] = entity

    for alias in source.aliases():
        if alias.is_invalid:
            # Invalid aliases are recorded as a Dataset.Invalid instance.
            resolved = Dataset.Invalid({'dataset': alias.dataset,
                                        'name': alias.name})
        elif alias.is_matched:
            resolved = Entity(source, alias.entity)
        else:
            # Aliases that are neither invalid nor matched are not preloaded.
            continue
        ENTITIES[(name, alias.name.lower())] = resolved

    return source
def extract_sections(party):
    """Yield every valid ``Section`` found in the document of *party*.

    The document returned by ``load_doc(party)`` is walked element by
    element.  Each ``h1``/``h2`` starts a new section: the section collected
    so far is yielded if its ``valid`` attribute is truthy, and the new one
    takes its title and level from the heading.  The heading fingerprint
    ``'[party:tag] text'`` is looked up in the ``btw13-titles`` dataset to
    set the section topic; a failed lookup is printed and otherwise ignored.

    The section still being collected when the document ends is yielded as
    well, so the last section is not lost.
    """
    doc = load_doc(party)
    nomenklatura = Dataset('btw13-titles',
                           api_key=os.environ.get('NOMENKLATURA_API_KEY'))
    current = Section(party)
    for h in doc.findall('.//*'):
        if h.tag in ('h1', 'h2'):
            if current.valid:
                yield current
            current = Section(party)
            current.title = h.text
            current.level = h.tag[1:]  # 'h1' -> '1', 'h2' -> '2'
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                current.topic = entity.name
            except Exception as e:
                # Best effort: a heading without a resolvable topic is
                # reported and the section is kept without a topic.
                print([fp])
                print(e)
        # NOTE(review): the original indentation was lost; this append is
        # assumed to run for every element (headings included) -- confirm.
        current.texts.append(h.text)
    # Flush the trailing section, which has no following heading to emit it.
    if current.valid:
        yield current
# Manual check of the nomenklatura client against the 'iso-countries' dataset:
# prints the dataset label, then looks up an entity by name and prints both
# the lookup result and what it dereferences to.
from nomenklatura import Dataset, NomenklaturaException

dataset = Dataset('iso-countries')
print([dataset.label])

# Disabled experiments, kept for reference:
#print list(dataset.entities())
#cyp = dataset.entity_by_name('cyprus')
#print cyp.__data__
#cyp.reviewed = False
#print cyp.__data__
#cyp.update()

# NOTE(review): 'christma island' looks like a deliberately misspelled name
# used to exercise dereference() -- confirm before "correcting" it.
e = dataset.entity_by_name('christma island')
print(repr(e))
print(repr(e.dereference()))
# Cross-check the values of a Nomenklatura dataset against a Korbo dataset:
# report values whose ``.value`` occurs more than once on either service,
# then list every Nomenklatura value with no same-named value on Korbo.
from nomenklatura import Dataset
from utils import KorboDataset

NOMENKLATURA_URL = 'http://nomenklatura.venturi.fbk.eu/'
NOMENKLATURA_PROJ = 'pois'
KORBO_URL = 'http://korbo.netseven.it/'
KORBO_PROJ = '101'


def _duplicated(values):
    """Return the sorted ``.value`` keys that occur more than once in *values*."""
    seen = set()
    repeated = set()
    for item in values:
        if item.value in seen:
            repeated.add(item.value)
        else:
            seen.add(item.value)
    return sorted(repeated)


nk_dataset = Dataset(NOMENKLATURA_PROJ, host=NOMENKLATURA_URL)
ko_dataset = KorboDataset(KORBO_PROJ, host=KORBO_URL)

nk_values = sorted(nk_dataset.values(), key=lambda x: x.value)
ko_values = sorted(ko_dataset.values(), key=lambda x: x.value)

# The sets hold the ``.value`` keys rather than the value objects, so that
# duplicate detection and the membership test below compare like with like.
nk_values_set = frozenset(item.value for item in nk_values)
ko_values_set = frozenset(item.value for item in ko_values)

# A list cannot be subtracted from a frozenset (TypeError), so the repeated
# keys are computed explicitly -- and each report uses its own service's data.
if len(nk_values) != len(nk_values_set):
    print("FOUND DUPLICATES ON NOMENKLATURA:  %s" % (_duplicated(nk_values),))
if len(ko_values) != len(ko_values_set):
    print("FOUND DUPLICATES ON KORBO:  %s" % (_duplicated(ko_values),))

cnt = 0
for nk_value in nk_values:
    if nk_value.value not in ko_values_set:
        cnt += 1
        print("%s NOT FOUND ON KORBO" % (nk_value,))
print("KORBO PROJ: %s issues found" % (cnt,))
from nomenklatura import Dataset from utils import KorboDataset NOMENKLATURA_URL = 'http://nomenklatura.venturi.fbk.eu/' NOMENKLATURA_PROJ = 'pois' KORBO_URL = 'http://korbo.netseven.it/' KORBO_PROJ = '101' nk_dataset = Dataset(NOMENKLATURA_PROJ, host=NOMENKLATURA_URL) ko_dataset = KorboDataset(KORBO_PROJ, host=KORBO_URL) nk_values = sorted(nk_dataset.values(), key=lambda x: x.value) ko_values = sorted(ko_dataset.values(), key=lambda x: x.value) nk_values_set = frozenset(nk_values) ko_values_set = frozenset(ko_values) if len(nk_values) != len(nk_values_set): print "FOUND DUPLICATES ON NOMENKLATURA: ", nk_values - nk_values_set if len(ko_values) != len(ko_values_set): print "FOUND DUPLICATES ON KORBO: ", nk_values - nk_values_set cnt = 0 for nk_value in nk_values: if nk_value.value not in ko_values: cnt += 1 print nk_value, "NOT FOUND ON KORBO" print "KORBO PROJ:", cnt, "issues found" def validate_label(label): if not label[0:1].isupper():