Example #1
def _get_measures_for_dataset(portal, dataset, datasetquality):
    graph = rdflib.Graph()
    # write dcat dataset into graph
    dataset_converter.dict_to_dcat(dataset.data.raw, portal, graph=graph)
    measures_g = rdflib.Graph()
    ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
    dataset_quality_to_dqv(measures_g, ds_id, datasetquality, dataset.snapshot)
    return measures_g, ds_id
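
A minimal usage sketch of the helper above; `portal`, `dataset`, and `datasetquality` stand for the same objects the surrounding examples pass around, and `serialize` is plain rdflib:

# sketch: serialize the per-dataset quality measures as JSON-LD
measures_g, ds_id = _get_measures_for_dataset(portal, dataset, datasetquality)
print(measures_g.serialize(format='json-ld'))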
Example #2
    def get(self, portalid, snapshot, datasetid):
        with Timer(key="PortalDatasetData.get", verbose=True):
            session = current_app.config['dbsession']

            q = session.query(DatasetData) \
                .join(Dataset, DatasetData.md5 == Dataset.md5) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.id == datasetid)
            data = q.first()

            P = session.query(Portal).filter(Portal.id == portalid).first()
            return jsonify(dict_to_dcat(data.raw, P))
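
One caveat in the handler above: `q.first()` returns `None` when no dataset matches the three filters, so `data.raw` raises `AttributeError` on a miss. A minimal guard, assuming the standard Flask `abort` helper:

from flask import abort

# sketch: return 404 instead of crashing when the dataset is absent
if data is None:
    abort(404)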
Example #3
def contactPerOrga(Session, portal, snapshot, orga):
    q = Session.query(Dataset) \
        .filter(Dataset.portalid == portal.id) \
        .filter(Dataset.snapshot == snapshot) \
        .filter(Dataset.organisation == orga) \
        .join(DatasetData, DatasetData.md5 == Dataset.md5) \
        .join(DatasetQuality, DatasetQuality.md5 == Dataset.md5) \
        .add_entity(DatasetData).add_entity(DatasetQuality)
    pereMail = set()
    for res in q:  # Dataset, DatasetData, DatasetQuality
        ds = row2dict(res)

        d = portal_fetch_processors.Dataset(snapshot=snapshot,
                                            portalID=portal.id,
                                            did=ds['id'],
                                            data=ds['raw'],
                                            status=200,
                                            software=portal.software)
        d.dcat = dict_to_dcat(ds['raw'], portal)
        contact = getContactPointValues(d)
        # contact[1] presumably holds the contact's e-mail address
        if len(contact) > 1:
            pereMail.add(contact[1])
    return pereMail
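
The function returns a set of contact e-mail strings. A hypothetical call, assuming a configured SQLAlchemy `Session`, a portal row, and a snapshot number as used throughout these examples:

emails = contactPerOrga(Session, portal, snapshot, 'some-organisation')  # organisation name is hypothetical
for email in sorted(emails):
    print(email)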
Example #4
    def _get_quality(self, args, data, filename):
        try:
            content_type = 'application/json'
            default_url = 'http://missing.portal.url.com'
            portal_url = args.get('portal_url', default_url)
            if not portal_url:
                portal_url = default_url

            default_out = 'json'
            out_format = args.get('format', default_out)
            if not out_format:
                out_format = default_out

            filter_metrics = args.get('metric')

            if 'software' in args:
                software = args['software']

                # stub portal class
                class Portal:
                    def __init__(self, software, uri):
                        self.software = software
                        self.apiuri = uri

                p = Portal(software, portal_url)

                # get rdf graph and add measures and dimensions
                graph = rdflib.Graph()
                # write dcat dataset into graph
                dcat = dataset_converter.dict_to_dcat(data, p, graph=graph)
                measures_g = rdflib.Graph()
                ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
                datasetquality = DatasetQuality(data, dcat)
                metrics_dict = datasetquality.__dict__

                if filter_metrics:
                    metrics_dict = {m: metrics_dict[m] for m in filter_metrics}

                if out_format == 'json':
                    resp = jsonify(metrics_dict)
                elif out_format == 'json-ld':
                    dataset_quality_to_dqv(measures_g, ds_id, datasetquality,
                                           utils_snapshot.getCurrentSnapshot())
                    dqv_export.add_dimensions_and_metrics(measures_g)
                    resp = jsonify(
                        json.loads(measures_g.serialize(format="json-ld")))
                elif out_format == 'csv':
                    outstr = StringIO.StringIO()
                    w = csv.DictWriter(outstr, metrics_dict.keys())
                    w.writeheader()
                    w.writerow(metrics_dict)
                    resp = outstr.getvalue()
                    content_type = 'text/csv'
                else:
                    raise Exception('output format not supported: ' +
                                    out_format)

                filename = secure_filename(filename).split('/')[-1]
                return makeResponse(resp, filename, content_type=content_type)
            else:
                e = 'Portal software parameter required for conversion. ' \
                    '"software" should be "CKAN", "Socrata", or "OpenDataSoft".'
        except Exception as ex:
            e = str(ex)

        resp = jsonify({'error': 'Could not parse JSON', 'message': e})
        resp.status_code = 406
        return resp
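
The handler serves three output formats: plain JSON of the metric values, a DQV graph as JSON-LD, and a one-row CSV. The CSV branch in isolation, with hypothetical metric names and values (`StringIO` is the Python 2 module used above):

import csv
import StringIO

metrics = {'date': 0.8, 'license': 0.5}  # hypothetical metric values
out = StringIO.StringIO()
w = csv.DictWriter(out, metrics.keys())
w.writeheader()
w.writerow(metrics)
print(out.getvalue())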
Example #5
def convert(portal, data):
    g = rdflib.Graph()
    # write dcat dataset into graph
    dataset_converter.dict_to_dcat(data, portal, graph=g)

    ds_id = g.value(predicate=RDF.type, object=DCAT.Dataset)
    doc = {
        "@context": "http://schema.org",
        "@type": "Dataset",
        "@id": str(ds_id),
        "catalog": {
            "@type": "DataCatalog",
            "@id": portal.uri,
            "url": portal.uri,
            "spatialCoverage": portal.iso,
            "description": "Underlying software: " + portal.software
        }
    }
    # organization
    if (ds_id, DCTERMS.publisher, None) in g:
        pub = {"@type": "Organization"}
        orga = g.value(ds_id, DCTERMS.publisher)
        resp_party(g, pub, orga)
        # contact point
        if (ds_id, DCAT.contactPoint, None) in g:
            orga = g.value(ds_id, DCAT.contactPoint)
            resp_party(g, pub, orga)
        doc['publisher'] = pub

    # general fields
    if (ds_id, DCTERMS.title, None) in g:
        doc["name"] = str(g.value(ds_id, DCTERMS.title))
    if (ds_id, DCTERMS.description, None) in g:
        doc["description"] = str(g.value(ds_id, DCTERMS.description))
    if (ds_id, DCAT.landingPage, None) in g:
        doc["url"] = str(g.value(ds_id, DCAT.landingPage))
    if (ds_id, DCTERMS.spatial, None) in g:
        doc["spatialCoverage"] = str(g.value(ds_id, DCTERMS.spatial))
    if (ds_id, DCTERMS.temporal, None) in g:
        doc["datasetTimeInterval"] = str(g.value(ds_id, DCTERMS.temporal))
    if (ds_id, DCAT.theme, None) in g:
        doc["about"] = str(g.value(ds_id, DCAT.theme))
    if (ds_id, DCTERMS.modified, None) in g:
        doc["dateModified"] = str(g.value(ds_id, DCTERMS.modified))
    if (ds_id, DCTERMS.issued, None) in g:
        doc["datePublished"] = str(g.value(ds_id, DCTERMS.issued))
    if (ds_id, DCTERMS.language, None) in g:
        doc["inLanguage"] = str(g.value(ds_id, DCTERMS.language))

    if (ds_id, DCAT.keyword, None) in g:
        doc["keywords"] = []
        for keyword in g.objects(ds_id, DCAT.keyword):
            doc["keywords"].append(str(keyword))

    doc["distribution"] = []
    for dist_id in g.objects(ds_id, DCAT.distribution):
        dist = {"@type": "DataDownload", "@id": str(dist_id)}

        if (dist_id, DCTERMS.title, None) in g:
            dist["name"] = str(g.value(dist_id, DCTERMS.title))
        if (dist_id, DCTERMS.description, None) in g:
            dist["description"] = str(g.value(dist_id, DCTERMS.description))
        if (dist_id, DCTERMS.modified, None) in g:
            dist["dateModified"] = str(g.value(dist_id, DCTERMS.modified))
        if (dist_id, DCTERMS.issued, None) in g:
            dist["datePublished"] = str(g.value(dist_id, DCTERMS.issued))
        if (dist_id, DCTERMS['format'], None) in g:
            dist["encodingFormat"] = str(g.value(dist_id, DCTERMS['format']))
        if (dist_id, DCAT.byteSize, None) in g:
            dist["contentSize"] = str(g.value(dist_id, DCAT.byteSize))
        if (dist_id, DCAT.mediaType, None) in g:
            dist["fileFormat"] = str(g.value(dist_id, DCAT.mediaType))

        if (dist_id, DCAT.accessURL, None) in g:
            dist["contentUrl"] = str(g.value(dist_id, DCAT.accessURL))
        elif (dist_id, DCAT.downloadURL, None) in g:
            dist["contentUrl"] = str(g.value(dist_id, DCAT.downloadURL))

        if (dist_id, DCTERMS.license, None) in g:
            lic = g.value(dist_id, DCTERMS.license)
            if isinstance(lic, BNode):
                # blank node: fall back to its label or identifier
                if (lic, RDFS.label, None) in g:
                    dist["license"] = str(g.value(lic, RDFS.label))
                elif (lic, DCTERMS.identifier, None) in g:
                    dist["license"] = str(g.value(lic, DCTERMS.identifier))
            else:
                dist["license"] = str(lic)
        doc["distribution"].append(dist)

    return doc
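
The result is a plain dict using schema.org vocabulary, ready for JSON serialisation. A minimal usage sketch, with `portal` and `data` as in the other examples:

import json

doc = convert(portal, data)
print(json.dumps(doc, indent=2))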
Example #6
def insertDatasets(P, db, iter, snapshot, batch=100, store_local=None):

    log.info("insertDatasets", portalid=P.id, snapshot=snapshot)

    bulk_obj = {'mr': [], 'd': [], 'dq': []}

    c = 0
    for i, d in enumerate(iter):
        c += 1
        with Timer(key='ProcessDataset'):
            #CREATE DATASET AND ADD

            with Timer(key='md5'):
                md5v = None if d.data is None else md5(d.data)

            if md5v:
                with Timer(key='dict_to_dcat'):
                    #analyse quality
                    d.dcat = dict_to_dcat(d.data, P)

                DD = None
                DQ = None
                with Timer(key='db.datasetdataExists(md5v)'):
                    process = not db.exist_datasetdata(md5v)
                if process:
                    #DATASET DATA
                    DD = createDatasetData(md5v, d)
                    try:
                        db.add(DD)  #primary key, needs to be inserted first
                        #DATASET QUALITY
                        DQ = createDatasetQuality(P, md5v, d)
                        bulk_obj['dq'].append(DQ)

                        #META RESOURCES
                        MQs = createMetaResources(md5v, d)
                        for MR in MQs:
                            bulk_obj['mr'].append(MR)
                    except Exception:
                        # most likely a duplicate md5 inserted concurrently; ignore
                        pass
                #DATASET
                title = getTitle(d)
                title = title[0] if len(title) > 0 else None

                D = Dataset(
                    id=d.id,
                    snapshot=d.snapshot,
                    portalid=d.portal_id,
                    md5=md5v,
                    organisation=DD.organisation if DD else getOrganization(d),
                    title=title)

                bulk_obj['d'].append(D)

                # store metadata in local git directory
                try:
                    if store_local is not None:
                        with Timer(key='store_to_local_git'):
                            if 'name' in d.data:
                                dir_name = d.data['name']
                            else:
                                dir_name = d.id
                            filename = os.path.join(store_local, P.id,
                                                    dir_name)
                            if not os.path.exists(filename):
                                os.makedirs(filename)

                            with open(os.path.join(filename, 'original.json'),
                                      'w') as f:
                                json.dump(d.data, f, indent=4)

                            g = rdflib.Graph()
                            g.parse(data=json.dumps(d.dcat), format='json-ld')
                            dqv_export.general_prov(g)
                            ds_id = g.value(predicate=RDF.type,
                                            object=DCAT.Dataset)
                            if not DQ:
                                DQ = db.datasetqualityExists(md5=md5v)
                            if DQ:
                                dqv_export.add_dimensions_and_metrics(g)
                                dataset_quality_to_dqv(g, ds_id, DQ, snapshot)
                            with open(
                                    os.path.join(filename, 'metadata.jsonld'),
                                    'w') as f:
                                g.serialize(f, format='json-ld')

                except Exception as exc:
                    ErrorHandler.handleError(log,
                                             "StoreToLocalGitException",
                                             exception=exc,
                                             pid=P.id,
                                             dataset=d.id,
                                             snapshot=snapshot,
                                             exc_info=True)
            else:
                D = Dataset(id=d.id,
                            snapshot=d.snapshot,
                            portalid=d.portal_id,
                            md5=md5v,
                            organisation=None)
                bulk_obj['d'].append(D)

        if i % batch == 0:
            bulkInsert(bulk_obj, db)
            for k in bulk_obj:
                bulk_obj[k] = []

    #cleanup, commit all left inserts
    bulkInsert(bulk_obj, db)
    for k in bulk_obj:
        bulk_obj[k] = []
    log.info("InsertedDatasets", parsed=c, portalid=P.id, snapshot=snapshot)
Example #7
def orgaReport(Session, portal, snapshot, orga, contact=None):
    with Timer(key=orga, verbose=True):
        q = Session.query(Dataset) \
            .filter(Dataset.portalid == portal.id) \
            .filter(Dataset.snapshot == snapshot) \
            .filter(Dataset.organisation == orga) \
            .join(DatasetData, DatasetData.md5 == Dataset.md5) \
            .join(DatasetQuality, DatasetQuality.md5 == Dataset.md5) \
            .add_entity(DatasetData).add_entity(DatasetQuality)
        pereMail = {}
        uris = set()
        summary = {'status': defaultdict(int)}
        summary['status'][200] = 0
        summary['status'][404] = 0
        summary['status']['total'] = 0

        for res in q:  #Dataset, DatasetData, DatasetQuality
            ds = {}
            ds['dataset'] = row2dict(res[0])
            ds['data'] = row2dict(res[1])
            ds['quality'] = row2dict(res[2])

            d = portal_fetch_processors.Dataset(snapshot=snapshot,
                                                portalID=portal.id,
                                                did=ds['dataset']['id'],
                                                data=ds['data']['raw'],
                                                status=200,
                                                software=portal.software)
            d.dcat = dict_to_dcat(ds['data']['raw'], portal)
            contactInfo = getContactPointValues(d)
            if len(contactInfo) > 1:
                if contact is not None and contact != contactInfo[1]:
                    continue

                ds['report'] = dataset_reporter.report(res[1], res[2],
                                                       portal.software)

                orgas = pereMail.setdefault(contactInfo[1], {})
                ds_list = orgas.setdefault(orga, [])
                ds_list.append(ds)
                ds['resourcesStatus'] = defaultdict(int)
                ds['resourcesStatus']['total'] = 0
                ds['resources'] = [row2dict(r) for r in res[1].resources]
                for resou in ds['resources']:
                    resri = Session.query(ResourceInfo).filter(
                        ResourceInfo.uri == resou['uri']).filter(
                            ResourceInfo.snapshot == snapshot).first()
                    if resri is not None:
                        resou['info'] = row2dict(resri)
                        ds['resourcesStatus'][resou['info']['status']] += 1
                        ds['resourcesStatus']['total'] += 1
                        if resou['uri'] not in uris:
                            summary['status'][resou['info']['status']] += 1
                            summary['status']['total'] += 1
                    if resou['uri'] not in uris:
                        uris.add(resou['uri'])

                ds['resourcesStatus'] = dict(ds['resourcesStatus'])
        ContactCount = 0
        #print "  Organisation:", orga
        for k, v in pereMail.items():
            print "  contact:", k
            for orga, ds_list in v.items():
                print "   ", orga, len(ds_list)
                ContactCount += len(ds_list)
                for ds in ds_list:
                    print "    >", ds['report']

        summary['status'] = dict(summary['status'])
        pereMail['summary'] = summary
        pereMail['summary']['totaluris'] = len(uris)
        return pereMail
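
The returned `pereMail` dict maps each contact e-mail to `{organisation: [dataset dicts]}`; every dataset dict bundles the raw rows (`dataset`, `data`, `quality`), a generated `report`, the per-resource rows with fetch info, and a `resourcesStatus` histogram. A top-level `summary` entry aggregates HTTP status counts and the number of distinct resource URIs.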