def portalsdynamicy():
    """Render the dynamics overview page for all portals at the current snapshot."""
    with Timer(key="get_portalsdynamicy", verbose=True):
        snapshot = getSnapshotfromTime(datetime.datetime.now())
        Session = current_app.config['dbsession']
        with Timer(key="query_portalsdynamicy", verbose=True):
            res = [
                r for r in Session.query(Portal).join(PortalSnapshotDynamicity)
                .filter(PortalSnapshotDynamicity.snapshot == snapshot)
                .add_entity(PortalSnapshotDynamicity)
            ]
        results = []
        keys = [
            'dindex', 'changefrequ', 'adddelratio', 'dyratio', 'staticRatio',
            'addRatio', 'delRatio', 'updatedRatio'
        ]
        for r in res:
            d = row2dict(r)
            for k in keys:
                # getattr is the idiomatic equivalent of __getattribute__
                d[k] = getattr(r[1], k)
            results.append(d)
        df = pd.DataFrame(results)
        for c in keys:
            # DataFrame.convert_objects was deprecated and later removed from
            # pandas; pd.to_numeric performs the same numeric coercion.
            df[c] = pd.to_numeric(df[c], errors='coerce')
        return render('odpw_portals_dynamics.jinja',
                      data={'portals': results},
                      keys=keys,
                      snapshot=snapshot)
def getResourceInfo(session, dbc, portalid, snapshot, orga=None):
    """Collect validity and HTTP-status distributions for a portal's resources."""
    with Timer(key="getResourceInfo", verbose=True):
        data = {}
        with Timer(key="query_getResourceInfoValid", verbose=True):
            data['valid'] = {
                entry[0]: entry[1]
                for entry in validURLDist(session, snapshot,
                                          portalid=portalid, orga=orga)
            }
        with Timer(key="query_getResourceInfoStatus", verbose=True):
            data['status'] = {}
            if orga:
                viewName = "view_{}_{}_{}_{}".format('resstatus', portalid,
                                                     snapshot, orga)
            else:
                viewName = "view_{}_{}_{}".format('resstatus', portalid, snapshot)
            qorg = statusCodeDist(session, snapshot, portalid=portalid, orga=orga)
            q = withView(qorg, viewName, session, dbc)
            start = time.time()
            for row in q:
                data['status'][row[0]] = row[1]
            elapsed = time.time() - start
            # Materialise a DB view when the distribution query was slow,
            # so future requests hit the precomputed view.
            if elapsed > 5:
                print("Create View {}".format(viewName))
                createView(qorg, viewName, session)
        return {'resourcesInfo': data}
def aggregatePortalQuality(db, portalid, snapshot):
    """Aggregate per-dataset quality metrics into one PortalSnapshotQuality row.

    Returns the PortalSnapshotQuality instance after adding it to the session.
    """
    log.info("Computing Aggregated Statistics", pid=portalid, snapshot=snapshot)
    with Timer(key=portalid + '-agg', verbose=True):
        df = aggregateByPortal3(db, portalid, snapshot)
        data = {}
        with Timer(key=portalid + '-mean', verbose=True):
            if df.shape[0] != 0:
                for i in boolTypeCol:
                    # Normalise boolean metric columns to 0/1 so describe()
                    # can average them.
                    if df[i].dtype.name == 'bool':
                        df[i] = df[i].astype(int)
                    else:
                        df[i] = df[i].replace(True, 1)
                        df[i] = df[i].replace(False, 0)
                # Hoisted: describe() was previously computed twice.
                desc = dict(df.describe())
                data = {
                    k: float(str(v[['mean']]['mean'].round(decimals=2)))
                    for k, v in desc.items()
                }
                data.update({
                    k + 'N': int(v[['count']]['count'])
                    for k, v in desc.items()
                })
            data['datasets'] = df.shape[0]
            PSQ = PortalSnapshotQuality(portalid=portalid,
                                        snapshot=snapshot, **data)
            db.add(PSQ)
            return PSQ
def portalResBody(snapshot, portalid):
    """Render the resource list page for a portal snapshot."""
    with Timer(key="get_portalRes", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        with Timer(key="query_portalRes", verbose=True):
            data = getResourceInfo(Session, dbc, portalid, snapshot)
        with Timer(key="query_getMetaResource", verbose=True):
            viewName = "view_{}_{}_{}".format('resinfo', portalid, snapshot)
            qorg = getResourceInfos(Session, snapshot, portalid)
            q = withView(qorg, viewName, Session, dbc)
            start = time.time()
            data['uris'] = [row2dict(i) for i in q]
            end = time.time()
            # Materialise a DB view when the query was slow.
            if (end - start) > 5:
                print("Create View {}".format(viewName))
                createView(qorg, viewName, Session)
        data.update(getPortalInfos(Session, portalid, snapshot))
        with Timer(key="query_portalResourceCount", verbose=True):
            r = current_app.config['dbsession'].query(
                Portal.resourcecount).filter(Portal.id == portalid)
            # unused local `ps = []` removed
            for P in r:
                data['resources'] = P[0]
        return render("odpw_portal_resources_list.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
def systemevolv():
    """Render the system evolution page (counts per snapshot and software)."""
    with Timer(key="get_systemevolv", verbose=True):
        Session = current_app.config['dbsession']
        with Timer(key="query_systemevolv", verbose=True):
            base = Session.query(
                PortalSnapshot.snapshot.label('snapshot'),
                Portal.software,
                PortalSnapshot.datasetcount,
                PortalSnapshot.resourcecount).join(Portal).subquery()
            grouped = Session.query(
                base.c.snapshot,
                base.c.software,
                func.count().label('count'),
                func.sum(base.c.resourcecount).label('resources'),
                func.sum(base.c.datasetcount).label('datasets')).group_by(
                    base.c.snapshot, base.c.software)
            rows = [row2dict(row) for row in grouped.all()]
        df = pd.DataFrame(rows)
        with Timer(key="plot_systemevolv", verbose=True):
            p = systemEvolutionPlot(df)
            script, div = components(p)
            js_resources = INLINE.render_js()
            css_resources = INLINE.render_css()
        return render("odpw_system_evolution.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources)
def portalLinkCheckCSV(portalid):
    """Export a CSV of (organisation, link-check URL) rows for a portal."""
    with Timer(key="get_portalLinkCheckCSV", verbose=True):
        buf = StringIO.StringIO()
        writer = csv.writer(buf)
        snapshot = getCurrentSnapshot()
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalorgas", verbose=True):
            q = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot) \
                .distinct(Dataset.organisation)
            data['organisations'] = [row2dict(res) for res in q]
        for entry in data['organisations']:
            orga = entry['organisation']
            linkcheck = 'https://tools.adequate.at' + url_for(
                '.orga_resources', portalid=portalid,
                snapshot=snapshot, orga=orga)
            writer.writerow([orga, linkcheck])
        output = make_response(buf.getvalue())
        output.headers[
            "Content-Disposition"] = "attachment; filename=export.csv"
        output.headers["Content-type"] = "text/csv"
        return output
def portalEvolution(snapshot, portalid):
    """Render the evolution chart of one portal across all of its snapshots."""
    with Timer(key="get_portalEvolution", verbose=True):
        Session = current_app.config['dbsession']
        rows = {}
        with Timer(key="query_portalEvolution", verbose=True):
            # Index fetch rows by portalid+snapshot, then merge in the
            # matching quality rows.
            for ps in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.portalid == portalid):
                rows[ps.portalid + str(ps.snapshot)] = row2dict(ps)
            for psq in Session.query(PortalSnapshotQuality).filter(
                    PortalSnapshotQuality.portalid == portalid):
                rows[psq.portalid + str(psq.snapshot)].update(row2dict(psq))
        df = pd.DataFrame(list(rows.values()))
        with Timer(key="plot_portalEvolution", verbose=True):
            p = evolutionCharts(df)
            script, div = components(p)
            js_resources = INLINE.render_js()
            css_resources = INLINE.render_css()
        data = getPortalInfos(Session, portalid, snapshot)
        return render("odpw_portal_evolution.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources,
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data)
def getStatus(self, uri):
    """Probe *uri* with HEAD; retry with GET when the HEAD answer is 400."""
    with Timer(key="getStatus"):
        with Timer(key="HEADLookup"):
            response = self.headlookup(uri)
        if response.status_code != 400:
            return response
        with Timer(key="GETLookup"):
            return self.getlookup(uri)
def aggregatePortalInfo(session, portalid, snapshot, dbc, limit=3):
    """Compute organisation/license/format distributions for a portal snapshot.

    limit: keep only the top-N entries per distribution (None keeps all).
    Returns {key: {'distinct': n, 'top3Dist': [...]}} per distribution key.
    """
    stats = {}
    with Timer(key=portalid + '-agg', verbose=True):
        ds = session.query(Dataset).filter(
            Dataset.snapshot == snapshot).filter(
                Dataset.portalid == portalid).count()
        rs = session.query(PortalSnapshot.resourcecount).filter(
            PortalSnapshot.portalid == portalid).filter(
                PortalSnapshot.snapshot == snapshot).first()
        # (Python-2-only bare `print` debug statement removed.)
        #TODO fix resource count
        for key, cFunc, dFunc in [('organisation', organisationDist,
                                   distinctOrganisations),
                                  ('license', licenseDist, distinctLicenses),
                                  ('format', formatDist, distinctFormats)]:
            # formats are counted per resource, everything else per dataset
            if key == 'format':
                total = row2dict(rs)['resourcecount']
            else:
                total = ds
            with Timer(key='query_{}-{}'.format(portalid, key), verbose=True):
                s = []
                viewName = "view_{}_{}_{}".format(key, portalid, snapshot)
                qorg = cFunc(session, snapshot, portalid=portalid)
                q = withView(qorg, viewName, session, dbc)
                start = time.time()
                if limit:
                    q = q.limit(limit)
                else:
                    q = q.all()
                for i in q:
                    d = row2dict(i)
                    d['perc'] = d['count'] / (1.0 * total) if total > 0 else 0
                    s.append(d)
                t = sum(item['count'] for item in s)
                # BUG fix: the remainder must be measured against `total`
                # (resourcecount for the 'format' key), not always against
                # `ds`; also guard the percentage against total == 0.
                if total - t != 0:
                    s.append({
                        key: 'Others',
                        'count': total - t,
                        'perc': (total - t) / (1.0 * total) if total > 0 else 0
                    })
                end = time.time()
                if (end - start) > 5:
                    log.info("Create View {}".format(viewName))
                    createView(qorg, viewName, session)
            stats[key] = {
                'distinct': dFunc(session, snapshot, portalid=portalid).count(),
                'top3Dist': s
            }
    return stats
def portaldash():
    """Render the portal dashboard landing page."""
    with Timer(key="get_portaldash", verbose=True):
        data = {}
        cursn = getSnapshotfromTime(datetime.datetime.now())
        Session = current_app.config['dbsession']
        with Timer(key="query_portaldash", verbose=True):
            data['portals'] = [row2dict(p) for p in Session.query(Portal).all()]
        return render("odpw_portaldash.jinja", data=data, snapshot=cursn)
def portalQuality(snapshot, portalid):
    """Render the quality-metrics chart page for a portal snapshot."""
    with Timer(key="portalQuality", verbose=True):
        Session = current_app.config['dbsession']
        # Removed: `df = portalSnapshotQualityDF(...)` — its result was
        # immediately overwritten below and never used.
        q = Session.query(PortalSnapshotQuality) \
            .filter(PortalSnapshotQuality.portalid == portalid) \
            .filter(PortalSnapshotQuality.snapshot == snapshot)
        qdata = None
        for r in q:
            qdata = row2dict(r)
            break
        # NOTE(review): if no quality row exists, qdata stays None and the
        # next line raises — presumably callers guarantee a row; confirm.
        d = []
        datasets = int(qdata['datasets'])
        for inD in qa:
            for k, v in inD['metrics'].items():
                k = k.lower()
                # TODO what to do if metric has no value?
                if qdata[k] is not None and qdata[k] != 'None':
                    value = float(qdata[k])
                    perc = int(qdata[k + 'N']) / (datasets * 1.0) if datasets > 0 else 0
                    c = {
                        'Metric': k,
                        'Dimension': inD['dimension'],
                        'dim_color': inD['color'],
                        'value': value,
                        'perc': perc
                    }
                    c.update(v)
                    d.append(c)
        df = pd.DataFrame(d)
        with Timer(key="dataDF", verbose=True) as t:
            p = qualityChart(df)
            script, div = components(p)
            js_resources = INLINE.render_js()
            css_resources = INLINE.render_css()
            data = getPortalInfos(Session, portalid, snapshot)
            data['portals'] = [row2dict(r) for r in Session.query(Portal).all()]
            data['quality'] = qdata
        return render("odpw_portal_quality.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources,
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data,
                      qa=qa)
def _portalSnapshotQuery(Session, portalid, snapshot, dynamicityEnabled):
    # Build the joined query for one snapshot: Portal + PortalSnapshot +
    # PortalSnapshotQuality (+ PortalSnapshotDynamicity when enabled).
    q = Session.query(Portal).filter(Portal.id == portalid) \
        .join(PortalSnapshotQuality, PortalSnapshotQuality.portalid == Portal.id) \
        .filter(PortalSnapshotQuality.snapshot == snapshot) \
        .join(PortalSnapshot, PortalSnapshot.portalid == Portal.id) \
        .filter(PortalSnapshot.snapshot == snapshot) \
        .add_entity(PortalSnapshot) \
        .add_entity(PortalSnapshotQuality)
    if dynamicityEnabled:
        q = q.join(PortalSnapshotDynamicity,
                   PortalSnapshotDynamicity.portalid == Portal.id) \
            .filter(PortalSnapshotDynamicity.snapshot == snapshot) \
            .add_entity(PortalSnapshotDynamicity)
    return q


def portal(portalid, snapshot=None):
    """Render the portal overview page.

    snapshot defaults to the current snapshot resolved per call — the
    previous default expression was evaluated only once, at import time.
    Falls back to earlier weeks until a populated snapshot is found.
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())
    with Timer(key="get_portal", verbose=True):
        current_sn = snapshot
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        dynamicityEnabled = current_app.config.get('dynamicity', False)
        with Timer(key="query_portal", verbose=True):
            r = _portalSnapshotQuery(Session, portalid, snapshot,
                                     dynamicityEnabled).first()
            # NOTE(review): loops forever if no snapshot ever matches —
            # presumably at least one populated snapshot exists; confirm.
            while r is None:
                snapshot = getPreviousWeek(snapshot)
                r = _portalSnapshotQuery(Session, portalid, snapshot,
                                         dynamicityEnabled).first()
            data['portal'] = row2dict(r[0])
            data['fetchInfo'] = row2dict(r[1])
            data['fetchInfo']['duration'] = data['fetchInfo']['end'] - data[
                'fetchInfo']['start']
            if dynamicityEnabled:
                data['dynamicity'] = row2dict(r[3])
            data['quality'] = row2dict(r[2])
        #with Timer(key="query_portal_agg", verbose=True):
        #    data.update(aggregatePortalInfo(Session,portalid,snapshot,dbc))
        return render("odpw_portal.jinja",
                      snapshot=current_sn,
                      portalid=portalid,
                      data=data)
def checkUpdateRobots(self, uri):
    # Fetch/refresh the robots.txt rules for uri's host (if not cached) and
    # return whether our user agent may crawl uri. A shared "in progress" set
    # prevents several worker threads from fetching the same robots.txt.
    ttl = 36000  # cache lifetime for parsed robots.txt rules, in seconds
    with Timer(key="checkUpdateRobots"):
        log.info("Robots.txt", url=uri, thread=current_thread())
        canonical = hostname(uri)
        robots_url = roboturl(uri)
        while canonical in self.robots.robotsInProgress:
            #seems one thread is doing the parsing already
            sleep(5)
        if canonical not in self.robots.robots._cache:
            # Mark this host as being fetched so other threads wait above.
            self.robots.robotsInProgress.add(canonical)
            with Timer(key="robots.fetch_parse"):
                try:
                    # First things first, fetch the thing
                    log.info('HTTP_GET', uri=robots_url)
                    req = self.rsession.get(robots_url,
                                            timeout=10,
                                            allow_redirects=True,
                                            headers=headers)
                    # And now parse the thing and update
                    import reppy.parser
                    r = reppy.parser.Rules(robots_url, req.status_code,
                                           req.content, time.time() + ttl)
                    self.robots.robots.add(r)
                    # Respect crawl-delay: schedule the URI accordingly.
                    delay = self.robots.robots.delay(uri, user_agent)
                    self.queue.addWait(uri, delay)
                except Exception as e:
                    with Timer(key="robots.allowed.error"):
                        ErrorHandler.handleError(log,
                                                 "Robots.txt Exception",
                                                 exception=e,
                                                 url=uri,
                                                 thread=current_thread())
                        try:
                            # Cache a synthetic 499 ruleset so the host is not
                            # re-fetched on every request while it is failing.
                            import reppy.parser
                            r = reppy.parser.Rules(robots_url, 499, "",
                                                   time.time() + ttl)
                            self.robots.robots.add(r)
                        except Exception as e:
                            ErrorHandler.handleError(
                                log,
                                "Robots.txt ExException",
                                exception=e,
                                url=uri,
                                thread=current_thread())
            # Unblock other threads waiting on this host.
            self.robots.robotsInProgress.remove(canonical)
    return self.robots.robots.allowed(uri, user_agent)
def aggregateByPortal1(db, portalid, snapshot):
    """Collect per-dataset quality attribute values of a portal snapshot.

    Returns a DataFrame with one column per quality attribute and one row
    per dataset.
    """
    with Timer(key="qualityDF1", verbose=True):
        result = defaultdict(list)
        q = db.Session.query(Dataset).filter(
            Dataset.snapshot == snapshot).filter(Dataset.portalid == portalid)
        # (Python-2-only debug prints of the query removed.)
        for d in q:
            with Timer(key="inspect1"):
                instance = inspect(d.data.quality)
                for key, x in instance.attrs.items():
                    result[key].append(x.value)
        return pd.DataFrame(result)
def getPortalInfos(Session, portalid, snapshot):
    """Return snapshot navigation info (all snapshots, prev/next week) for a portal."""
    with Timer(key="get_getPortalInfos", verbose=True):
        with Timer(key="query_getPortalInfos", verbose=True):
            rows = Session.query(PortalSnapshot.snapshot).filter(
                PortalSnapshot.portalid == portalid).all()
            snapshots = [row[0] for row in rows]
        prev_week = getPreviousWeek(snapshot)
        if prev_week not in snapshots:
            prev_week = None
        next_week = getNextWeek(snapshot)
        if next_week not in snapshots:
            next_week = None
        return {
            'snapshots': {
                'list': snapshots,
                'prev': prev_week,
                'next': next_week
            }
        }
def orga_resource(portalid, snapshot, orga):
    """Render the resource list for one organisation of a portal snapshot."""
    with Timer(key="get_orga_resource", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        data = getResourceInfo(Session, dbc, portalid, snapshot, orga)
        data['resList'] = []
        for row in getResourceInfos(Session, snapshot, portalid, orga):
            ds_id, ds_title = row[1], row[2]
            data['resList'].append({
                'uri': row2dict(row[0]),
                'dataset': {
                    'uri': getDatasetURI(ds_id, portalid),
                    'title': ds_title
                }
            })
        data.update(getPortalInfos(Session, portalid, snapshot))
        counts = current_app.config['dbsession'].query(
            Portal.resourcecount).filter(Portal.id == portalid)
        for row in counts:
            data['resources'] = row[0]
        return render("odpw_portal_resources_list.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
def systemEvolutionBarPlot(df, yLabel, values):
    # Build a stacked bar chart (legacy bokeh.charts Bar) of *values* summed
    # per snapshot, stacked by software, with the legend moved outside the
    # plot area on the right.
    with Timer(key='systemEvolutionBarPlot', verbose=True):
        p = Bar(df, label='snapshot', values=values, agg='sum',
                stack='software', legend='bottom_left', bar_width=0.5,
                xlabel="Snapshots", ylabel=yLabel, responsive=True,
                height=200, tools='hover')

        # NOTE(review): bar_source is collected but never used afterwards.
        glyph_renderers = p.select(GlyphRenderer)
        bar_source = [glyph_renderers[i].data_source
                      for i in range(len(glyph_renderers))]
        hover = p.select(HoverTool)
        hover.tooltips = [
            ('software', ' @software'),
            ('value', '@height'),
        ]
        # Show snapshot ids as week strings on the x-axis.
        p.xaxis.formatter = FuncTickFormatter.from_py_func(getWeekStringTick)
        p.axis.minor_tick_line_color = None
        p.background_fill_color = "#fafafa"
        p.legend.location = "top_left"
        p.toolbar.logo = None
        p.toolbar_location = None

        # Detach the auto-generated legend items and re-attach them in a new
        # Legend laid out to the right of the plot.
        legend = p.legend[0].legends
        p.legend[0].legends = []
        l = Legend(location=(0, -30))
        l.items = legend
        p.add_layout(l, 'right')
        return p
def portalLicenses(snapshot, portalid):
    """Render license/format/organisation distributions for a portal snapshot."""
    with Timer(key="get_portalLicenseDist", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalLicenseDist", verbose=True):
            data['portals'] = [row2dict(p) for p in Session.query(Portal).all()]
            # limit=None: fetch the full distributions, not only the top-3
            data.update(aggregatePortalInfo(Session, portalid, snapshot,
                                            dbc, limit=None))
        return render("odpw_portal_dist.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
def portalreport(portalid, snapshot=None):
    """Render the per-organisation report page for a portal.

    snapshot defaults to the current snapshot resolved per call — the
    previous default expression was evaluated only once, at import time.
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())
    with Timer(key="get_portal", verbose=True):
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalreport", verbose=True):
            q = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot).distinct(Dataset.organisation)
            data['organisations'] = [row2dict(res) for res in q]
        return render("odpw_portal_report.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data)
def systemchanges():
    """Render week-over-week portal changes (status, dataset and resource counts)."""
    with Timer(key="get_systemchanges"):
        Session = current_app.config['dbsession']
        cursn = getSnapshotfromTime(datetime.datetime.now())
        prevWeek = getPreviousWeek(cursn)
        with Timer(key="query_systemchanges"):
            data_cur = {
                ps.portalid: ps
                for ps in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == cursn)
            }
            data_prev = {
                ps.portalid: ps
                for ps in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == prevWeek)
            }
        data = {'status_change': {}, 'ds_change': {}, 'res_change': {}}
        for pid, cur in data_cur.items():
            prev = data_prev.get(pid)
            if prev is None:
                continue
            if cur.status != prev.status:
                data['status_change'][pid] = {
                    'from': prev.status,
                    'to': cur.status
                }
            elif cur.datasetcount != prev.datasetcount:
                data['ds_change'][pid] = {
                    'from': prev.datasetcount if prev.datasetcount is not None else 0,
                    'to': cur.datasetcount if cur.datasetcount is not None else 0
                }
            elif cur.resourcecount != prev.resourcecount:
                data['res_change'][pid] = {
                    'from': prev.resourcecount if prev.resourcecount is not None else 0,
                    'to': cur.resourcecount if cur.resourcecount is not None else 0
                }
        data['from'] = prevWeek
        data['to'] = cursn
        return render("odpw_system_changes.jinja", data=data)
def cli(args, dbm):
    """Run the HEAD-lookup pipeline: queue filler, worker threads, inserter.

    args: parsed CLI namespace providing batch, threads and delay.
    dbm: database manager wrapped by DBClient.
    """
    sn = getCurrentSnapshot()
    db = DBClient(dbm)
    batch = args.batch
    concurrent = args.threads
    delay = args.delay
    log.info("START HEAD", batch=batch, delay=delay, threads=concurrent)
    rsession = requests.Session()
    robots = RobotsManager(rsession)
    q = DomainQueue(args.delay)
    filler = QueueFiller(db, q, robots, batch * 2, sn, concurrent)
    filler.daemon = True
    # Pre-fill the queue before the workers start consuming it.
    filler.filling_queue(batch=batch)
    resultQueue = Queue(maxsize=0)
    # start worker threads
    for i in range(concurrent):
        t = Worker(q=q,
                   resultQueue=resultQueue,
                   robots=robots,
                   rsession=rsession,
                   sn=sn)
        t.daemon = True
        t.start()
    filler.start()
    # `//` keeps integer division on both Python 2 and 3 (plain `/` yields
    # a float batch size under Python 3).
    inserter = Inserter(db=db,
                        resultQueue=resultQueue,
                        domainQueue=q,
                        batch=batch // 2)
    inserter.start()
    filler.join()
    inserter.join()
    Timer.printStats()
    import sys
    sys.exit(0)
def systemfetch():
    """Render fetch-process durations of the last five snapshot weeks."""
    with Timer(key="get_systemfetch"):
        Session = current_app.config['dbsession']
        cursn = getSnapshotfromTime(datetime.datetime.now())
        snapshots = getLastNSnapshots(cursn, n=5)
        nWeeksago = snapshots[-1]
        cnts = defaultdict(int)
        data = {}
        with Timer(key="query_systemfetch"):
            q = Session.query(PortalSnapshot.snapshot,
                              PortalSnapshot.start,
                              PortalSnapshot.end - PortalSnapshot.start).filter(
                                  PortalSnapshot.snapshot > nWeeksago)
            for sn, start, dur in q:
                cnts[sn] += 1
                by_start = data.setdefault(sn, {})
                if dur is not None:
                    by_start.setdefault(start, []).append(dur.total_seconds())
        # Convert absolute start times to offsets from the earliest start of
        # each snapshot, plus the fetch duration.
        for sn, by_start in data.items():
            earliest = min(by_start.keys())
            data[sn] = [
                (start - earliest).total_seconds() + dur
                for start, durations in by_start.items()
                for dur in durations
            ]
        with Timer(key="plot_systemfetch"):
            p = fetchProcessChart(data, cnts)
            script, div = components(p)
        js_resources = INLINE.render_js()
        css_resources = INLINE.render_css()
        return render("odpw_system_fetch.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources)
def portalOrgareport(portalid, orga, snapshot=None):
    """Render the contact report for one organisation of a portal.

    snapshot defaults to the current snapshot resolved per call — the
    previous default expression was evaluated only once, at import time.
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())
    with Timer(key="get_portal", verbose=True):
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalreport", verbose=True):
            portal = Session.query(Portal).filter(
                Portal.id == portalid).first()
            data['contacts'] = contactPerOrga(Session, portal, snapshot, orga)
        return render("odpw_portal_report_contacts.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data,
                      organisation=orga)
def createDatasetQuality(P, md5v, dataset):
    """Run every DCAT quality metric over *dataset* and wrap the results."""
    with Timer(key='quality'):
        metrics = {
            analyser.id.lower(): analyser.analyse_Dataset(dataset)
            for _, analyser in dcat_analyser().items()
        }
        return DatasetQuality(md5=md5v, **metrics)
def get(self, portalid):
    """Return all snapshot ids recorded for *portalid* as JSON."""
    with Timer(key="PortalSnapshots.get", verbose=True):
        session = current_app.config['dbsession']
        query = session.query(PortalSnapshot.snapshot).filter(
            PortalSnapshot.portalid == portalid)
        return jsonify([row2dict(row) for row in query.all()])
def resourceInfo(snapshot, portalid, uri):
    """Render the detail page for one resource URI of a portal snapshot."""
    with Timer(key="get_resourceInfo", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_resources", verbose=True):
            viewName = "view_{}_{}_{}".format('resinfo', portalid, snapshot)
            qorg = getResourceInfos(Session, snapshot, portalid)
            q = withView(qorg, viewName, Session, dbc)
            start = time.time()
            data['resources'] = [row2dict(r) for r in q.all()]
            end = time.time()
            if (end - start) > 5:
                print("Create View {}".format(viewName))
                try:
                    createView(qorg, viewName, Session)
                except Exception as e:
                    # str(e) instead of e.message: exception .message was
                    # removed in Python 3. A concurrent request may have
                    # created the view already — that is fine.
                    if 'already exists' in str(e):
                        pass
                    else:
                        # bare raise preserves the original traceback
                        raise
        with Timer(key="query_resourceInfo", verbose=True):
            q = Session.query(ResourceInfo) \
                .filter(ResourceInfo.uri == uri)
            data['resourceInfo'] = [row2dict(r) for r in q.all()]
            for r in data['resourceInfo']:
                if 'header' in r:
                    if r['header'] is None:
                        r['header'] = ""
                    else:
                        # headers are stored as a stringified dict literal
                        r['header'] = ast.literal_eval(str(r['header']))
        return render("odpw_portal_resource.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      uri=uri,
                      data=data)
def createMetaResources(md5v, dataset):
    """Build MetaResource rows for every distribution URL of *dataset*.

    Returns a list of MetaResource instances (duplicate URIs are skipped).
    """
    with Timer(key='createMetaResources'):
        res = getDistributionAccessURLs(dataset) + getDistributionDownloadURLs(
            dataset)
        bulk_mr = []
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                # str(e) instead of e.message (removed in Python 3); the
                # original URI is kept, only flagged invalid.
                log.debug("URIFormat", uri=uri, md5=md5v, msg=str(e))
                valid = False
            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)
            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # BUG fix: was md5=md5 — an undefined name; the parameter
                # is md5v.
                ErrorHandler.handleError(log,
                                         "safe_url_string",
                                         exception=exc,
                                         md5=md5v,
                                         uri=uri,
                                         exc_info=True)
            if uri in uris:
                log.debug("WARNING, duplicate URI",
                          dataset=dataset.id,
                          md5=md5v,
                          uri=uri,
                          format=f,
                          media=m)
                continue
            try:
                # size may arrive as a numeric string; normalise to int
                s = int(float(s)) if s is not None else None
            except Exception:
                s = None
            MR = MetaResource(uri=uri,
                              md5=md5v,
                              media=m,
                              valid=valid,
                              format=normaliseFormat(f),
                              size=s,
                              created=toDatetime(c),
                              modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
        return bulk_mr
def getPortalDatasets(Session, portalid, snapshot):
    """Return the (title, id) rows of all datasets of a portal snapshot."""
    with Timer(key="getPortalDatasets", verbose=True):
        rows = Session.query(Dataset.title, Dataset.id) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.snapshot == snapshot).all()
        return {"datasets": [row2dict(row) for row in rows]}
def portalsquality():
    """Render aggregated quality metrics per portal, ISO country and software."""
    with Timer(key="get_portalsquality", verbose=True):
        Session = current_app.config['dbsession']
        snapshot = getSnapshotfromTime(datetime.datetime.now())
        with Timer(key="query_portalsquality"):
            results = [
                row2dict(r) for r in Session.query(
                    Portal, Portal.datasetcount,
                    Portal.resourcecount).join(PortalSnapshotQuality).filter(
                        PortalSnapshotQuality.snapshot == snapshot).add_entity(
                            PortalSnapshotQuality)
            ]
        keys = [i.lower() for q in qa for i in q['metrics']]
        df = pd.DataFrame(results)
        for c in keys:
            # DataFrame.convert_objects was deprecated and later removed from
            # pandas; pd.to_numeric performs the same numeric coercion.
            df[c] = pd.to_numeric(df[c], errors='coerce')
        dfiso = df.groupby(['iso'])
        dfiso = dfiso.agg('mean') \
            .join(pd.DataFrame(dfiso.size(), columns=['count']))
        resultsIso = dfiso.reset_index().to_dict(orient='records')
        dfsoft = df.groupby(['software'])
        dfsoft = dfsoft.agg('mean') \
            .join(pd.DataFrame(dfsoft.size(), columns=['count']))
        resultSoft = dfsoft.reset_index().to_dict(orient='records')
        return render('odpw_portals_quality.jinja',
                      data={
                          'portals': results,
                          'iso': resultsIso,
                          'soft': resultSoft
                      },
                      keys=keys,
                      snapshot=snapshot)
def run(self):
    # Worker main loop: pull URIs from the queue, probe them (robots.txt
    # permitting), and hand the outcome to handle_request. A None item is the
    # poison pill that stops the worker.
    log.info("Started", thread=current_thread())
    while True:
        with Timer(key="doWork"):
            try:
                with Timer(key="q.get"):
                    uri = self.queue.get()
                if uri is None:
                    # poison pill — shut this worker down
                    break
                try:
                    #check if this is a valid URL, if we have an exception, done
                    o = urlparse(uri)
                    if o.scheme.startswith('http'):
                        res = None
                        if self.checkUpdateRobots(uri):
                            res = self.getStatus(uri)
                        else:
                            log.info("ROBOTS DENIED",
                                     uri=uri,
                                     thread=current_thread())
                        # res stays None when robots.txt denied the fetch
                        self.handle_request(uri, res, None)
                    else:
                        raise InvalidSchema(
                            "No connection adapters were found for " + uri)
                except Exception as e:
                    # Per-URI failure: record the error against the URI and
                    # keep the worker alive.
                    ErrorHandler.handleError(log,
                                             exception=e,
                                             msg="doWork",
                                             exc_info=True,
                                             url=uri,
                                             thread=current_thread())
                    self.handle_request(uri, None, e)
            except Exception as e:
                # Failure outside the per-URI handling (e.g. queue access);
                # log it and continue the loop.
                ErrorHandler.handleError(log,
                                         exception=e,
                                         msg="uncaught in doWork",
                                         exc_info=True,
                                         thread=current_thread())
    log.info("STOPPED", thread=current_thread())