Beispiel #1
0
def dataset(name):
    """Return the cached ``Dataset`` registered under *name*.

    The first request for a given name builds the dataset with the API key
    from ``app.config['NOMENKLATURA_API_KEY']`` and stores it in
    ``DATASETS``.  If ``app.config['NOMENKLATURA_PRELOAD']`` is truthy, the
    ``ENTITIES`` lookup table is filled at the same time, keyed by
    ``(name, lower-cased entity or alias name)``:

    * every entity maps to itself;
    * an invalid alias maps to a ``Dataset.Invalid`` marker;
    * a matched alias maps to an ``Entity`` built from ``alias.entity``;
    * any other alias is left out.
    """
    # Fast path: the dataset was already created by an earlier call.
    if name in DATASETS:
        return DATASETS[name]

    created = Dataset(name, api_key=app.config['NOMENKLATURA_API_KEY'])
    DATASETS[name] = created

    if not app.config['NOMENKLATURA_PRELOAD']:
        return DATASETS[name]

    for known in created.entities():
        ENTITIES[(name, known.name.lower())] = known

    for alias in created.aliases():
        if alias.is_invalid:
            target = Dataset.Invalid({'dataset': alias.dataset,
                                      'name': alias.name})
        elif alias.is_matched:
            target = Entity(created, alias.entity)
        else:
            # Neither invalid nor matched: nothing to record for this alias.
            continue
        ENTITIES[(name, alias.name.lower())] = target

    return DATASETS[name]
Beispiel #2
0
def extract_sections(party):
    """Yield the valid ``Section`` objects found in *party*'s document.

    The document returned by ``load_doc(party)`` is walked element by
    element.  Every ``h1``/``h2`` element closes the section collected so
    far and opens a new one, whose ``title`` is the heading text and whose
    ``level`` is the heading tag without its leading ``h``.  The heading is
    also looked up in the ``btw13-titles`` nomenklatura dataset to set the
    section's ``topic``.  The text of every element, headings included, is
    appended to the ``texts`` of the section that is current at that point.

    :param party: identifier passed to ``load_doc`` and ``Section`` and
        used in the lookup key ``'[party:tag] heading text'``.
    :returns: a generator of sections whose ``valid`` attribute is truthy.
    """
    doc = load_doc(party)

    nomenklatura = Dataset('btw13-titles',
                           api_key=os.environ.get('NOMENKLATURA_API_KEY'))

    current = Section(party)
    for h in doc.findall('.//*'):
        if h.tag in ('h1', 'h2'):
            # A new heading ends the previous section.
            if current.valid:
                yield current
            current = Section(party)
            current.title = h.text
            current.level = h.tag[1:]
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                current.topic = entity.name
            except Exception as e:
                # Best effort: a failed lookup leaves the topic unset and
                # is only reported, so the remaining sections still load.
                print([fp])
                print(e)
        current.texts.append(h.text)

    # Sections are otherwise emitted only when the *next* heading shows up,
    # so without this the last section of the document would be dropped.
    if current.valid:
        yield current
Beispiel #3
0
from nomenklatura import Dataset, NomenklaturaException

# Example: open the 'iso-countries' dataset, show its label, then fetch one
# entity by name and show both the entity and what it dereferences to.
dataset = Dataset('iso-countries')
print([dataset.label])

# Disabled experiments, kept for reference.

#print list(dataset.entities())

#cyp = dataset.entity_by_name('cyprus') 
#print cyp.__data__
#cyp.reviewed = False
#print cyp.__data__
#cyp.update()

# NOTE(review): 'christma island' looks like a deliberately misspelled name,
# presumably to exercise dereference() below -- confirm before "fixing" it.
e = dataset.entity_by_name('christma island')
print(repr(e))
resolved = e.dereference()
print(repr(resolved))
Beispiel #4
0
from nomenklatura import Dataset
from utils import KorboDataset


# Consistency check: report values of the nomenklatura project that are
# missing from the korbo project, plus duplicated values on either side.
NOMENKLATURA_URL = 'http://nomenklatura.venturi.fbk.eu/'
NOMENKLATURA_PROJ = 'pois'

KORBO_URL = 'http://korbo.netseven.it/'
KORBO_PROJ = '101'


def _duplicates(values):
    """Return the items of *values* that repeat an earlier item, in order."""
    seen = set()
    repeated = []
    for item in values:
        if item in seen:
            repeated.append(item)
        else:
            seen.add(item)
    return repeated


nk_dataset = Dataset(NOMENKLATURA_PROJ, host=NOMENKLATURA_URL)
ko_dataset = KorboDataset(KORBO_PROJ, host=KORBO_URL)
nk_values = sorted(nk_dataset.values(), key=lambda x: x.value)
ko_values = sorted(ko_dataset.values(), key=lambda x: x.value)

# A size mismatch between list and set means at least one value repeats.
nk_values_set = frozenset(nk_values)
ko_values_set = frozenset(ko_values)
if len(nk_values) != len(nk_values_set):
    # A list cannot be subtracted from a frozenset (TypeError), so the
    # repeated items are collected explicitly.
    print("FOUND DUPLICATES ON NOMENKLATURA: " + str(_duplicates(nk_values)))
if len(ko_values) != len(ko_values_set):
    # Report the korbo duplicates here, not the nomenklatura ones.
    print("FOUND DUPLICATES ON KORBO: " + str(_duplicates(ko_values)))


# Compare `.value` with `.value`: the membership test used to check a
# `.value` against the list of value objects themselves.  Building the set
# once also avoids a linear scan per nomenklatura value.
ko_value_keys = frozenset(item.value for item in ko_values)

cnt = 0
for nk_value in nk_values:
    if nk_value.value not in ko_value_keys:
        cnt += 1
        print("%s NOT FOUND ON KORBO" % (nk_value,))
print("KORBO PROJ: %s issues found" % (cnt,))
Beispiel #5
0
from nomenklatura import Dataset
from utils import KorboDataset

# Consistency check: report values of the nomenklatura project that are
# missing from the korbo project, plus duplicated values on either side.
NOMENKLATURA_URL = 'http://nomenklatura.venturi.fbk.eu/'
NOMENKLATURA_PROJ = 'pois'

KORBO_URL = 'http://korbo.netseven.it/'
KORBO_PROJ = '101'


def _duplicates(values):
    """Return the items of *values* that repeat an earlier item, in order."""
    seen = set()
    repeated = []
    for item in values:
        if item in seen:
            repeated.append(item)
        else:
            seen.add(item)
    return repeated


nk_dataset = Dataset(NOMENKLATURA_PROJ, host=NOMENKLATURA_URL)
ko_dataset = KorboDataset(KORBO_PROJ, host=KORBO_URL)
nk_values = sorted(nk_dataset.values(), key=lambda x: x.value)
ko_values = sorted(ko_dataset.values(), key=lambda x: x.value)

# A size mismatch between list and set means at least one value repeats.
nk_values_set = frozenset(nk_values)
ko_values_set = frozenset(ko_values)
if len(nk_values) != len(nk_values_set):
    # A list cannot be subtracted from a frozenset (TypeError), so the
    # repeated items are collected explicitly.
    print("FOUND DUPLICATES ON NOMENKLATURA: " + str(_duplicates(nk_values)))
if len(ko_values) != len(ko_values_set):
    # Report the korbo duplicates here, not the nomenklatura ones.
    print("FOUND DUPLICATES ON KORBO: " + str(_duplicates(ko_values)))

# Compare `.value` with `.value`: the membership test used to check a
# `.value` against the list of value objects themselves.  Building the set
# once also avoids a linear scan per nomenklatura value.
ko_value_keys = frozenset(item.value for item in ko_values)

cnt = 0
for nk_value in nk_values:
    if nk_value.value not in ko_value_keys:
        cnt += 1
        print("%s NOT FOUND ON KORBO" % (nk_value,))
print("KORBO PROJ: %s issues found" % (cnt,))


def validate_label(label):
    if not label[0:1].isupper():