def cli(args, dbm):
    sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    rdf_dir = args.dir
    if not os.path.exists(rdf_dir):
        os.mkdir(rdf_dir)
    sn_dir = os.path.join(rdf_dir, str(sn))
    if not os.path.exists(sn_dir):
        os.mkdir(sn_dir)

    tasks = []
    if args.portalid:
        # .first() returns None for a missing portal; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, sn_dir))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, sn_dir))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf, portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(streamCSVs, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
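# A minimal sketch of the worker contract that pool.imap relies on above,
# assuming streamCSVs takes the (Portal, dbConf, snapshot, sn_dir) task tuple
# and returns a (Portal, snapshot) pair, which the loop unpacks as x[0], x[1].
# The real implementation lives elsewhere in the repo; this is illustrative.
def streamCSVs_sketch(task):
    P, dbConf, sn, sn_dir = task
    # ... open a DB connection from dbConf and write the portal's CSVs into sn_dir ...
    return P, sn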
def portalLinkCheckCSV(portalid):
    with Timer(key="get_portalLinkCheckCSV", verbose=True):
        si = StringIO.StringIO()
        cw = csv.writer(si)
        snapshot = getCurrentSnapshot()
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalorgas", verbose=True):
            q = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot) \
                .distinct(Dataset.organisation)
            data['organisations'] = [row2dict(res) for res in q]
        for o in data['organisations']:
            orga = o['organisation']
            # with Timer(key="query_orga-emails", verbose=True):
            #     portal = Session.query(Portal).filter(Portal.id == portalid).first()
            #     # print('portal: ', portal, 'snapshot: ', snapshot, 'orga: ', orga)
            #     data['contacts'] = contactPerOrga(Session, portal, snapshot, orga)
            #     for cont in data['contacts']:
            linkcheck = 'https://tools.adequate.at' + url_for(
                '.orga_resources', portalid=portalid, snapshot=snapshot, orga=orga)
            cw.writerow([orga, linkcheck])
        output = make_response(si.getvalue())
        output.headers["Content-Disposition"] = "attachment; filename=export.csv"
        output.headers["Content-type"] = "text/csv"
        return output
def cli(args, dbm):
    if args.snapshot:
        sn = args.snapshot
    else:
        sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    tasks = []
    if args.portalid:
        # .first() returns None for a missing portal; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn))

    log.info("START FRESHNESS", processors=args.processors, dbConf=dbConf, portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(change_history, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
def cli(args, dbm):
    db = DBClient(dbm)
    git = None
    if args.config:
        with open(args.config) as f_conf:
            config = yaml.load(f_conf)
        if 'git' in config:
            git = config['git']
    if not git:
        log.warn("GIT LOCATION OR URL NOT SPECIFIED")
        return

    sn = getCurrentSnapshot()
    if args.portalid:
        # .first() returns None for a missing portal; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        git_update(P, sn, git)
    else:
        for P in db.Session.query(Portal):
            git_update(P, sn, git)
def get(self, portalid, datasetid):
    # Memento-style content negotiation: map Accept-Datetime to a snapshot
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()

    session = current_app.config['dbsession']
    q = session.query(DatasetData) \
        .join(Dataset, DatasetData.md5 == Dataset.md5) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    data = q.first()
    p = session.query(Portal).filter(Portal.id == portalid).first()
    doc = dcat_to_schemadotorg.convert(p, data.raw)

    timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/schemadotorg>'
    resp = add_memento_header(jsonify(doc), '<' + doc['@id'] + '>', timegate, sn)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/schemadotorg>'
    resp.headers['Content-Location'] = full_url
    return resp
def get(self, portalid, datasetid):
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()

    session = current_app.config['dbsession']
    p = session.query(Portal).filter(Portal.id == portalid).first()
    q = session.query(DatasetQuality) \
        .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    dataset_qual = q.first()
    q = session.query(Dataset) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    dataset = q.first()

    # get rdf graph and add measures and dimensions
    g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
    dqv_export.add_dimensions_and_metrics(g)
    resp = jsonify(json.loads(g.serialize(format="json-ld")))

    timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
    resp = add_memento_header(resp, ds_id.n3(), timegate, sn)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/dqv>'
    resp.headers['Content-Location'] = full_url
    return resp
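# A minimal sketch of what add_memento_header (used by the two handlers above)
# could look like, assuming it sets the RFC 7089 Memento-Datetime and Link
# headers on a Flask response; the helper's real signature and behavior are
# defined elsewhere in the repo.
def add_memento_header_sketch(resp, original, timegate, sn):
    d = tofirstdayinisoweek(sn)  # snapshot number -> datetime of that ISO week
    resp.headers['Memento-Datetime'] = d.strftime('%a, %d %b %Y %H:%M:%S GMT')
    resp.headers['Link'] = original + '; rel="original", ' + timegate + '; rel="timegate"'
    return resp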
def cli(args, dbm):
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    if not args.sn:
        sn = getCurrentSnapshot()
    else:
        sn = args.sn
    directory = args.directory

    tasks = []
    if args.portalid:
        # .first() returns None for a missing portal; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, directory))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, directory))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf, portals=len(tasks))
    portals = []
    pool = Pool(args.processors)
    for x in pool.imap(generate_schemadotorg_files, tasks):
        pid, lastmod, sn = x[0].id, x[1], x[2]
        portals.append((pid, lastmod))
        log.info("RECEIVED RESULT", portalid=pid)
    create_portal_sitemapindex(portals, directory)
def cli(args, dbm):
    sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    aggregateFormatDist(db, sn)
def portalRes(portalid, snapshot=None):
    if not snapshot:
        snapshot = getCurrentSnapshot()
    Session = current_app.config['dbsession']
    data = {}
    data.update(getPortalInfos(Session, portalid, snapshot))
    return render("odpw_portal_resources.jinja", data=data, snapshot=snapshot, portalid=portalid)
def cli(args, dbm):
    sn = getCurrentSnapshot()
    db = DBClient(dbm)
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    seen = set()
    crawler.crawl(HeadLookups, snapshot=sn, db=db, batch=args.batch,
                  seen=seen, iso=args.iso, exclude_iso=args.exclude_iso)
    crawler.start()
def cli(args, dbm):
    datadir = None
    git_location = None
    git_url = None
    if args.config:
        with open(args.config) as f_conf:
            config = yaml.load(f_conf)
        if 'data' in config:
            datadir = config['data']['datadir']
        if 'git' in config and 'datadir' in config['git']:
            git_location = config['git']['datadir']
        if 'git' in config and 'external' in config['git']:
            git_url = config['git']['external']
    if datadir is None:
        log.error("No data dir specified in config", config=args.config)
        return
    log.info("Init datadir", datadir=datadir)

    sn = getCurrentSnapshot()
    api = DBClient(dbm=dbm)
    if args.portal:
        # .first() returns None for a missing portal; .one() would raise instead
        P = api.Session.query(Portal).filter(Portal.id == args.portal).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portal)
            return
        crawler = CrawlerProcess()
        crawler.crawl(DataMonitorSpider, api=api, datadir=datadir, snapshot=sn,
                      format=args.format, portalID=P.id, git_location=git_location,
                      csvclean=args.clean, git_url=git_url + P.id + '/')
        crawler.start()
    else:
        # schedule all portals in a single CrawlerProcess: Twisted's reactor
        # cannot be restarted, so calling start() once per portal would fail
        crawler = CrawlerProcess()
        for P in api.Session.query(Portal):
            log.warn("DOWNLOAD RESOURCES", portalid=P.id)
            crawler.crawl(DataMonitorSpider, api=api, datadir=datadir, snapshot=sn,
                          format=args.format, portalID=P.id, git_location=git_location,
                          csvclean=args.clean, git_url=git_url + P.id + '/')
        crawler.start()
def get(self, portalid, datasetid):
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()
    resp = get_dataset(portalid, sn, datasetid)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = HOST + '/' + portalid + '/' + d.strftime("%Y%m%d") + '/' + datasetid
    resp.headers['Content-Location'] = full_url
    return resp
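# A minimal sketch of the Accept-Datetime parsing assumed by the handlers
# above: RFC 7089 mandates RFC 1123 datetimes, which strptime handles
# directly. The repo's actual parse_rfc1123 may differ.
from datetime import datetime

def parse_rfc1123_sketch(value):
    # e.g. "Thu, 31 May 2007 20:35:00 GMT"
    return datetime.strptime(value, '%a, %d %b %Y %H:%M:%S GMT')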
def cli(args, dbm):
    sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    store_local = None
    if args.config:
        with open(args.config) as f:
            config = yaml.load(f)
        if 'git' in config and 'datadir' in config['git']:
            store_local = config['git']['datadir']

    tasks = []
    if args.portalid:
        # .first() returns None for a missing portal; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        tasks.append((P, dbConf, sn, store_local))
    elif args.repair:
        # re-fetch every portal without a successful (HTTP 200) snapshot,
        # dropping its partial snapshot and quality rows first
        valid = db.Session.query(PortalSnapshot.portalid) \
            .filter(PortalSnapshot.snapshot == sn) \
            .filter(PortalSnapshot.status == 200).subquery()
        for P in db.Session.query(Portal).filter(Portal.id.notin_(valid)):
            PS = db.Session.query(PortalSnapshot) \
                .filter(PortalSnapshot.snapshot == sn) \
                .filter(PortalSnapshot.portalid == P.id)
            PS.delete(synchronize_session=False)
            PSQ = db.Session.query(PortalSnapshotQuality) \
                .filter(PortalSnapshotQuality.snapshot == sn) \
                .filter(PortalSnapshotQuality.portalid == P.id)
            PSQ.delete(synchronize_session=False)
            tasks.append((P, dbConf, sn, store_local))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, store_local))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf, portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(fetchHttp, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
def cli(args, dbm):
    sn = getCurrentSnapshot()
    db = DBClient(dbm)
    batch = args.batch
    concurrent = args.threads
    delay = args.delay
    log.info("START HEAD", batch=batch, delay=delay, threads=concurrent)

    rsession = requests.Session()
    robots = RobotsManager(rsession)
    q = DomainQueue(args.delay)
    filler = QueueFiller(db, q, robots, batch * 2, sn, concurrent)
    filler.daemon = True
    filler.filling_queue(batch=batch)

    resultQueue = Queue(maxsize=0)
    # start worker threads
    for i in range(concurrent):
        t = Worker(q=q, resultQueue=resultQueue, robots=robots, rsession=rsession, sn=sn)
        t.daemon = True
        t.start()

    filler.start()
    inserter = Inserter(db=db, resultQueue=resultQueue, domainQueue=q, batch=batch / 2)
    inserter.start()
    filler.join()
    inserter.join()
    Timer.printStats()

    import sys
    sys.exit(0)
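# A minimal sketch of the consumer loop a Worker thread presumably runs,
# assuming the domain queue yields URLs and results are handed to the Inserter
# via resultQueue. The real Worker class is defined elsewhere in the repo, and
# robots.allowed() is a hypothetical RobotsManager method used for illustration.
import threading

class WorkerSketch(threading.Thread):
    def __init__(self, q, resultQueue, robots, rsession, sn):
        threading.Thread.__init__(self)
        self.q, self.resultQueue = q, resultQueue
        self.robots, self.rsession, self.sn = robots, rsession, sn

    def run(self):
        while True:
            url = self.q.get()
            if self.robots.allowed(url):  # hypothetical robots.txt check
                r = self.rsession.head(url, allow_redirects=True)
                self.resultQueue.put((url, self.sn, r.status_code))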
def portalDynamicLinkCheck(portalid):
    snapshot = getCurrentSnapshot()
    return portalLinkCheck(snapshot, portalid)
def csv_clean(filename, git_url, orig_url, metadata, stream_orig=True, max_file_size=10):
    # get the file size in MB
    filesize = os.path.getsize(filename) >> 20
    if filesize > max_file_size:
        return

    # TODO read csv files in dir, run pyyacp and track modifications, read jsonld,
    # add new resource with description and modifications
    out_encoding = 'utf-8'
    out_delimiter = ','
    reader = yacp.YACParser(filename=filename)
    deli = reader.meta['delimiter']
    encoding = reader.meta['encoding']
    descriptionLines = reader.descriptionLines
    header_line = reader.header_line

    f_name = os.path.basename(filename)
    cleaned_path = os.path.join(os.path.dirname(filename), '..', 'cleaned')
    if not os.path.exists(cleaned_path):
        os.mkdir(cleaned_path)
    cleaned_content = reader.generate(delimiter=out_delimiter, comments=False)
    with codecs.open(os.path.join(cleaned_path, f_name), 'w', out_encoding) as out_f:
        out_f.write(cleaned_content.decode(out_encoding))

    g = rdflib.Graph()
    g.parse(metadata, format="json-ld")
    snapshot = getCurrentSnapshot()
    activity = adequate_prov(g, snapshot)

    if stream_orig:
        try:
            # add csvw info to orig resource
            stream_csv.addMetadata(orig_url, snapshot, g, csvw_activity=activity)
        except Exception as e:
            ErrorHandler.handleError(log, "GetCSVWMetadata", exception=e, url=orig_url,
                                     snapshot=snapshot, exc_info=True)

    dataset_ref = g.value(predicate=RDF.type, object=DCAT.Dataset)
    repo_name = g.value(subject=dataset_ref, predicate=AD.repositoryName)

    # add new resource
    git_res_page = git_url + str(repo_name) + '/tree/master/cleaned/' + f_name
    git_res_raw = git_url + str(repo_name) + '/raw/master/cleaned/' + f_name
    distribution = URIRef(git_res_page)
    access_url = URIRef(git_res_raw)
    g.add((dataset_ref, DCAT.distribution, distribution))
    g.add((distribution, RDF.type, DCAT.Distribution))
    g.add((distribution, DCAT.accessURL, access_url))

    # prov information
    g.add((access_url, RDF.type, PROV.Entity))
    g.add((access_url, PROV.wasDerivedFrom, URIRef(orig_url)))
    g.add((access_url, PROV.wasGeneratedBy, activity))
    g.add((activity, PROV.generated, access_url))

    # add CSV modifications to metadata
    if not header_line:
        g.add((access_url, AD.csvCleanModification, AD.GenericHeader))
    if deli != out_delimiter:
        g.add((access_url, AD.csvCleanModification, AD.DefaultDelimiter))
    if encoding != out_encoding:
        g.add((access_url, AD.csvCleanModification, AD.Utf8Encoding))
    if descriptionLines:
        g.add((access_url, AD.csvCleanModification, AD.DropCommentLines))
        # add comment lines metadata
        for l in descriptionLines:
            out = StringIO()
            w = csv.writer(out)
            w.writerow([v.encode(out_encoding) for v in l])
            g.add((distribution, RDFS.comment, Literal(out.getvalue())))

    g.serialize(destination=metadata, format='json-ld')
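# A minimal sketch of the provenance helper assumed above: adequate_prov
# presumably mints a prov:Activity for this cleaning run and attaches it to
# the graph. The URI scheme and timestamp choice here are illustrative, not
# the repo's actual implementation.
def adequate_prov_sketch(g, snapshot):
    activity = URIRef('https://tools.adequate.at/activity/csvclean/' + str(snapshot))
    g.add((activity, RDF.type, PROV.Activity))
    g.add((activity, PROV.startedAtTime, Literal(tofirstdayinisoweek(snapshot))))
    return activity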
def _get_quality(self, args, data, filename):
    try:
        content_type = 'application/json'
        default_url = 'http://missing.portal.url.com'
        portal_url = args.get('portal_url', default_url)
        if not portal_url:
            portal_url = default_url
        default_out = 'json'
        out_format = args.get('format', default_out)
        if not out_format:
            out_format = default_out
        filter_metrics = args.get('metric')

        if 'software' in args:
            software = args['software']

            # stub portal class
            class Portal:
                def __init__(self, software, uri):
                    self.software = software
                    self.apiuri = uri

            p = Portal(software, portal_url)
            # get rdf graph and add measures and dimensions
            graph = rdflib.Graph()
            # write dcat dataset into graph
            dcat = dataset_converter.dict_to_dcat(data, p, graph=graph)
            measures_g = rdflib.Graph()
            ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
            datasetquality = DatasetQuality(data, dcat)
            metrics_dict = datasetquality.__dict__
            if filter_metrics:
                metrics_dict = {m: metrics_dict[m] for m in filter_metrics}

            if out_format == 'json':
                resp = jsonify(metrics_dict)
            elif out_format == 'json-ld':
                dataset_quality_to_dqv(measures_g, ds_id, datasetquality,
                                       utils_snapshot.getCurrentSnapshot())
                dqv_export.add_dimensions_and_metrics(measures_g)
                resp = jsonify(json.loads(measures_g.serialize(format="json-ld")))
            elif out_format == 'csv':
                outstr = StringIO.StringIO()
                w = csv.DictWriter(outstr, metrics_dict.keys())
                w.writeheader()
                w.writerow(metrics_dict)
                resp = outstr.getvalue()
                content_type = 'text/csv'
            else:
                raise Exception('output format not supported: ' + out_format)

            filename = secure_filename(filename).split('/')[-1]
            return makeResponse(resp, filename, content_type=content_type)
        else:
            e = 'Portal software parameter required for conversion. ' \
                '"software" should be "CKAN", "Socrata", or "OpenDataSoft".'
    except Exception as ex:
        e = ex.message
    resp = jsonify({'error': 'Could not parse JSON', 'message': e})
    resp.status_code = 406
    return resp
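# A hypothetical invocation sketch for _get_quality, assuming args behaves like
# Flask's request.args and that the call runs inside a request context (jsonify
# requires one); app, api, and dataset_dict are illustrative names, not the
# repo's.
with app.test_request_context():
    resp = api._get_quality(
        {'software': 'CKAN', 'format': 'json-ld',
         'portal_url': 'https://data.example.org'},
        dataset_dict,        # raw dataset metadata as a dict
        'mydataset.json')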
def portalDataset(snapshot, portalid, dataset):
    with Timer(key="portalDataset", verbose=True):
        if not snapshot:
            snapshot = getCurrentSnapshot()
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        # data['portals'] = [row2dict(r) for r in Session.query(Portal).all()]
        data.update(getPortalDatasets(Session, portalid, snapshot))

        dd = None
        if dataset:
            for dt in data['datasets']:
                if dt['id'] == dataset:
                    dd = dt
                    break

            with Timer(key="getPortalDatasets_datasetData", verbose=True):
                r = Session.query(DatasetData).join(Dataset) \
                    .filter(Dataset.id == dataset) \
                    .join(DatasetQuality).add_entity(DatasetQuality).first()
                data['datasetData'] = row2dict(r)
                software = Session.query(Portal.software).filter(Portal.id == portalid).first()[0]
                if software == 'Socrata':
                    data['json'] = data['datasetData']['raw']['view']
                else:
                    data['json'] = data['datasetData']['raw']
                data['report'] = dataset_reporter.report(r[0], r[1], software=None)
                # with Timer(key="getSchemadotorgDatasets", verbose=True):
                #     q = Session.query(Portal).filter(Portal.id == portalid)
                #     p = q.first()
                #     schemadotorg = json.dumps(dcat_to_schemadotorg.convert(p, r[0]), indent=3)

            with Timer(key="getPortalDatasets_resources", verbose=True):
                q = Session.query(MetaResource, ResourceInfo) \
                    .filter(MetaResource.md5 == r[0].md5) \
                    .outerjoin(ResourceInfo, and_(ResourceInfo.uri == MetaResource.uri,
                                                  ResourceInfo.snapshot == snapshot))
                data['resources'] = [row2dict(r) for r in q.all()]
                for r in data['resources']:
                    if 'header' in r and isinstance(r['header'], basestring):
                        r['header'] = ast.literal_eval(r['header'])

            with Timer(key="getPortalDatasets_versions", verbose=True):
                # group the snapshot range of each dataset version by content hash
                q = Session.query(Dataset.md5,
                                  func.min(Dataset.snapshot).label('min'),
                                  func.max(Dataset.snapshot).label('max')) \
                    .filter(Dataset.id == dataset).group_by(Dataset.md5)
                r = [row2dict(r) for r in q.all()]
                versions = {}
                for i in r:
                    a = versions.setdefault(i['md5'], [])
                    a.append({'min': i['min'], 'max': i['max']})
                data['versions'] = r

        return render("odpw_portal_dataset.jinja", data=data, snapshot=snapshot,
                      portalid=portalid, dataset=dd, qa=qa, error=errorStatus)