def portalsdynamicy():
    """Render the dynamicity overview of all portals for the current snapshot.

    Every ``Portal`` is joined with its ``PortalSnapshotDynamicity`` row for
    the snapshot derived from the current time.  The metrics named in
    ``keys`` are copied from the dynamicity entity into the portal's dict and
    the list of dicts is passed to ``odpw_portals_dynamics.jinja``.

    Returns:
        The value returned by ``render`` for the template.
    """
    with Timer(key="get_portalsdynamicy", verbose=True):
        snapshot = getSnapshotfromTime(datetime.datetime.now())
        Session = current_app.config['dbsession']

        with Timer(key="query_portalsdynamicy", verbose=True):
            query = Session.query(Portal) \
                .join(PortalSnapshotDynamicity) \
                .filter(PortalSnapshotDynamicity.snapshot == snapshot) \
                .add_entity(PortalSnapshotDynamicity)
            rows = list(query)

        keys = [
            'dindex', 'changefrequ', 'adddelratio', 'dyratio', 'staticRatio',
            'addRatio', 'delRatio', 'updatedRatio'
        ]
        results = []
        for row in rows:
            d = row2dict(row)
            # row[1] is the PortalSnapshotDynamicity entity added to the query.
            dynamicity = row[1]
            for k in keys:
                d[k] = getattr(dynamicity, k)
            results.append(d)

        # A DataFrame used to be built from ``results`` here and converted
        # with ``convert_objects``; it was never read, relied on a pandas API
        # that has since been removed, and raised KeyError when no rows
        # matched, so it has been dropped.
        return render('odpw_portals_dynamics.jinja',
                      data={'portals': results},
                      keys=keys,
                      snapshot=snapshot)
def getResourceInfo(session, dbc, portalid, snapshot, orga=None):
    """Collect the URL-validity and status-code distributions of a portal.

    Args:
        session: database session handed to the distribution helpers.
        dbc: passed through to ``withView``.
        portalid: portal identifier used for filtering and in the view name.
        snapshot: snapshot identifier used for filtering and in the view name.
        orga: optional organisation; when truthy it is appended to the view
            name and forwarded to the distribution helpers.

    Returns:
        dict: ``{'resourcesInfo': {'valid': {...}, 'status': {...}}}`` where
        each inner dict maps the first column of a result row to its second.
    """
    with Timer(key="getResourceInfo", verbose=True):
        info = {}

        with Timer(key="query_getResourceInfoValid", verbose=True):
            valid_rows = validURLDist(session,
                                      snapshot,
                                      portalid=portalid,
                                      orga=orga)
            info['valid'] = {row[0]: row[1] for row in valid_rows}

        with Timer(key="query_getResourceInfoStatus", verbose=True):
            if orga:
                viewName = "view_{}_{}_{}_{}".format('resstatus', portalid,
                                                     snapshot, orga)
            else:
                viewName = "view_{}_{}_{}".format('resstatus', portalid,
                                                  snapshot)

            base_query = statusCodeDist(session,
                                        snapshot,
                                        portalid=portalid,
                                        orga=orga)
            view_query = withView(base_query, viewName, session, dbc)

            started = time.time()
            info['status'] = {row[0]: row[1] for row in view_query}
            finished = time.time()
            # A slow read (more than five seconds) triggers creation of the
            # view from the original, unwrapped query.
            if (finished - started) > 5:
                print("Create View {}".format(viewName))
                createView(base_query, viewName, session)

        return {'resourcesInfo': info}
Exemple #3
0
def aggregatePortalQuality(db, portalid, snapshot):
    """Aggregate per-dataset quality values into one portal-level record.

    The DataFrame returned by ``aggregateByPortal3`` is summarised with
    ``DataFrame.describe``: for every described column ``k`` the mean
    (rounded to two decimals) is stored under ``k`` and the count under
    ``k + 'N'``.  The number of rows is stored under ``'datasets'``.

    Args:
        db: object with an ``add`` method; receives the new record.
        portalid: portal identifier (concatenated with a str for timer keys).
        snapshot: snapshot identifier.

    Returns:
        The ``PortalSnapshotQuality`` instance that was added to ``db``.
    """
    log.info("Computing Aggregated Statistics",
             pid=portalid,
             snapshot=snapshot)

    with Timer(key=portalid + '-agg', verbose=True):
        df = aggregateByPortal3(db, portalid, snapshot)

    data = {}
    with Timer(key=portalid + '-mean', verbose=True):
        if df.shape[0] != 0:
            # Turn boolean columns into 0/1 so describe() includes them.
            for col in boolTypeCol:
                if df[col].dtype.name == 'bool':
                    df[col] = df[col].astype(int)
                else:
                    df[col] = df[col].replace(True, 1)
                    df[col] = df[col].replace(False, 0)

            # describe() is computed once and reused for both passes.
            summary = dict(df.describe())
            data = {
                k: float(str(v['mean'].round(decimals=2)))
                for k, v in summary.items()
            }
            # Counts are written in a second pass so that a ``k + 'N'`` key
            # always wins over an identically named mean, as before.
            data.update({
                k + 'N': int(v['count'])
                for k, v in summary.items()
            })

    data['datasets'] = df.shape[0]
    PSQ = PortalSnapshotQuality(portalid=portalid, snapshot=snapshot, **data)
    db.add(PSQ)
    return PSQ
def portalResBody(snapshot, portalid):
    """Render the resource list of a portal for one snapshot.

    Combines the result of ``getResourceInfo``, the rows of
    ``getResourceInfos`` (wrapped by ``withView``), the snapshot navigation
    data from ``getPortalInfos`` and the portal's ``resourcecount`` and
    renders ``odpw_portal_resources_list.jinja`` with them.

    Args:
        snapshot: snapshot identifier.
        portalid: portal identifier.

    Returns:
        The value returned by ``render`` for the template.
    """
    with Timer(key="get_portalRes", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        with Timer(key="query_portalRes", verbose=True):
            data = getResourceInfo(Session, dbc, portalid, snapshot)

        with Timer(key="query_getMetaResource", verbose=True):
            viewName = "view_{}_{}_{}".format('resinfo', portalid, snapshot)
            qorg = getResourceInfos(Session, snapshot, portalid)
            q = withView(qorg, viewName, Session, dbc)
            start = time.time()
            data['uris'] = [row2dict(i) for i in q]
            end = time.time()
            # A read slower than five seconds triggers creation of the view.
            if (end - start) > 5:
                print("Create View {}".format(viewName))
                createView(qorg, viewName, Session)

        data.update(getPortalInfos(Session, portalid, snapshot))
        with Timer(key="query_portalResourceCount", verbose=True):
            # Reuse the session fetched above instead of looking it up again.
            counts = Session.query(
                Portal.resourcecount).filter(Portal.id == portalid)
            for row in counts:
                data['resources'] = row[0]

        return render("odpw_portal_resources_list.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
def systemevolv():
    """Render the system evolution page.

    Groups all portal snapshots by snapshot and portal software, counting
    rows and summing resource and dataset counts, plots the result with
    ``systemEvolutionPlot`` and renders ``odpw_system_evolution.jinja``.

    Returns:
        The value returned by ``render`` for the template.
    """
    with Timer(key="get_systemevolv", verbose=True):
        Session = current_app.config['dbsession']

        with Timer(key="query_systemevolv", verbose=True):
            # One row per portal snapshot, annotated with the portal software.
            per_portal = Session.query(
                PortalSnapshot.snapshot.label('snapshot'),
                Portal.software,
                PortalSnapshot.datasetcount,
                PortalSnapshot.resourcecount,
            ).join(Portal).subquery()
            cols = per_portal.c

            grouped = Session.query(
                cols.snapshot,
                cols.software,
                func.count().label('count'),
                func.sum(cols.resourcecount).label('resources'),
                func.sum(cols.datasetcount).label('datasets'),
            ).group_by(cols.snapshot, cols.software)

            records = [row2dict(row) for row in grouped.all()]
            df = pd.DataFrame(records)

        with Timer(key="plot_systemevolv", verbose=True):
            plot = systemEvolutionPlot(df)
            script, div = components(plot)

        return render("odpw_system_evolution.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=INLINE.render_js(),
                      css_resources=INLINE.render_css())
def portalLinkCheckCSV(portalid):
    """Return a CSV download listing one link-check URL per organisation.

    For the current snapshot every distinct ``Dataset.organisation`` of the
    portal is written as a row ``[organisation, link]`` where ``link`` is
    ``https://tools.adequate.at`` followed by the ``.orga_resources`` URL.

    Args:
        portalid: portal identifier.

    Returns:
        A response carrying the CSV text with attachment headers set.
    """
    with Timer(key="get_portalLinkCheckCSV", verbose=True):
        buf = StringIO.StringIO()
        writer = csv.writer(buf)
        snapshot = getCurrentSnapshot()

        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalorgas", verbose=True):
            orga_query = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot) \
                .distinct(Dataset.organisation)

            data['organisations'] = [row2dict(row) for row in orga_query]

        for entry in data['organisations']:
            orga = entry['organisation']
            path = url_for('.orga_resources',
                           portalid=portalid,
                           snapshot=snapshot,
                           orga=orga)
            writer.writerow([orga, 'https://tools.adequate.at' + path])

        output = make_response(buf.getvalue())
        output.headers["Content-Disposition"] = \
            "attachment; filename=export.csv"
        output.headers["Content-type"] = "text/csv"
        return output
def portalEvolution(snapshot, portalid):
    """Render the evolution charts of one portal across all its snapshots.

    ``PortalSnapshot`` and ``PortalSnapshotQuality`` rows of the portal are
    merged per snapshot into one dict each, turned into a DataFrame and
    plotted with ``evolutionCharts``.

    Args:
        snapshot: snapshot identifier used for the navigation data and the
            template.
        portalid: portal identifier.

    Returns:
        The value returned by ``render`` for ``odpw_portal_evolution.jinja``.
    """
    with Timer(key="get_portalEvolution", verbose=True):
        Session = current_app.config['dbsession']
        data = {}
        with Timer(key="query_portalEvolution", verbose=True):
            for R in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.portalid == portalid):
                data[R.portalid + str(R.snapshot)] = row2dict(R)
            for R in Session.query(PortalSnapshotQuality).filter(
                    PortalSnapshotQuality.portalid == portalid):
                # setdefault: a quality row without a matching PortalSnapshot
                # row used to raise KeyError here.
                data.setdefault(R.portalid + str(R.snapshot),
                                {}).update(row2dict(R))

        df = pd.DataFrame(list(data.values()))
        with Timer(key="plot_portalEvolution", verbose=True):
            p = evolutionCharts(df)
            script, div = components(p)

            js_resources = INLINE.render_js()
            css_resources = INLINE.render_css()

        # From here on ``data`` holds the snapshot navigation info only.
        data = getPortalInfos(Session, portalid, snapshot)

        return render("odpw_portal_evolution.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources,
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data)
Exemple #8
0
 def getStatus(self, uri):
     """Look up ``uri`` via ``headlookup`` and fall back to ``getlookup``.

     The fallback is used only when the first response has status code 400.
     Returns the response object of whichever lookup was used last.
     """
     with Timer(key="getStatus"):
         with Timer(key="HEADLookup"):
             response = self.headlookup(uri)
         if response.status_code != 400:
             return response
         with Timer(key="GETLookup"):
             return self.getlookup(uri)
Exemple #9
0
def aggregatePortalInfo(session, portalid, snapshot, dbc, limit=3):
    """Compute organisation, license and format distributions of a portal.

    For each of the three keys the matching distribution query is run
    (wrapped by ``withView``), every row gets a ``'perc'`` share of the
    total, and an ``'Others'`` entry is appended for whatever part of the
    total the listed rows do not cover.

    The total is the portal's ``resourcecount`` for ``'format'`` and the
    number of datasets otherwise.

    Args:
        session: database session.
        portalid: portal identifier (also used in timer keys and view names).
        snapshot: snapshot identifier.
        dbc: passed through to ``withView``.
        limit: maximum number of rows per distribution; a falsy value
            fetches all rows.

    Returns:
        dict: ``{key: {'distinct': <count>, 'top3Dist': [row dicts]}}``.
    """
    stats = {}
    with Timer(key=portalid + '-agg', verbose=True):
        ds = session.query(Dataset).filter(
            Dataset.snapshot == snapshot).filter(
                Dataset.portalid == portalid).count()
        rs = session.query(PortalSnapshot.resourcecount).filter(
            PortalSnapshot.portalid == portalid).filter(
                PortalSnapshot.snapshot == snapshot).first()
        #TODO fix resource count
        # A missing PortalSnapshot row or a NULL count is treated as 0
        # resources instead of crashing.
        resource_total = 0
        if rs is not None:
            resource_total = row2dict(rs)['resourcecount'] or 0

        distributions = [
            ('organisation', organisationDist, distinctOrganisations),
            ('license', licenseDist, distinctLicenses),
            ('format', formatDist, distinctFormats),
        ]
        for key, cFunc, dFunc in distributions:
            total = resource_total if key == 'format' else ds
            with Timer(key='query_{}-{}'.format(portalid, key), verbose=True):
                viewName = "view_{}_{}_{}".format(key, portalid, snapshot)
                qorg = cFunc(session, snapshot, portalid=portalid)
                q = withView(qorg, viewName, session, dbc)
                start = time.time()
                rows = q.limit(limit) if limit else q.all()

                s = []
                for i in rows:
                    d = row2dict(i)
                    d['perc'] = d['count'] / (1.0 * total) if total > 0 else 0
                    s.append(d)

                # The remainder is measured against the same total the
                # percentages use.  It used to be checked against the dataset
                # count even for formats, whose total is the resource count.
                remainder = total - sum(item['count'] for item in s)
                if remainder != 0:
                    s.append({
                        key: 'Others',
                        'count': remainder,
                        'perc': remainder / (1.0 * total) if total > 0 else 0
                    })
                end = time.time()
                # A read slower than five seconds triggers creation of the view.
                if (end - start) > 5:
                    log.info("Create View {}".format(viewName))
                    createView(qorg, viewName, session)
                stats[key] = {
                    'distinct': dFunc(session, snapshot,
                                      portalid=portalid).count(),
                    'top3Dist': s
                }

    return stats
def portaldash():
    """Render the portal dashboard listing every portal.

    Returns:
        The value returned by ``render`` for ``odpw_portaldash.jinja``, given
        all portals as dicts and the snapshot derived from the current time.
    """
    with Timer(key="get_portaldash", verbose=True):
        cursn = getSnapshotfromTime(datetime.datetime.now())
        Session = current_app.config['dbsession']
        with Timer(key="query_portaldash", verbose=True):
            portals = []
            for entry in Session.query(Portal).all():
                portals.append(row2dict(entry))
        return render("odpw_portaldash.jinja",
                      data={'portals': portals},
                      snapshot=cursn)
def portalQuality(snapshot, portalid):
    """Render the quality page of a portal for one snapshot.

    Reads the portal's ``PortalSnapshotQuality`` row and builds one chart
    record per metric in the module-level ``qa`` definition that has a value.
    The records are plotted with ``qualityChart`` and rendered with
    ``odpw_portal_quality.jinja``.

    Args:
        snapshot: snapshot identifier.
        portalid: portal identifier.

    Returns:
        The value returned by ``render`` for the template.
    """
    with Timer(key="portalQuality", verbose=True):

        Session = current_app.config['dbsession']
        # The DataFrame formerly fetched here with portalSnapshotQualityDF
        # was overwritten before being read, so the call has been dropped.
        first_row = Session.query(PortalSnapshotQuality) \
            .filter(PortalSnapshotQuality.portalid == portalid) \
            .filter(PortalSnapshotQuality.snapshot == snapshot) \
            .first()
        # NOTE(review): qdata stays None when no quality row exists for this
        # portal/snapshot; the lookup below then raises TypeError as before.
        qdata = row2dict(first_row) if first_row is not None else None
        d = []

        datasets = int(qdata['datasets'])
        for inD in qa:
            for k, v in inD['metrics'].items():
                k = k.lower()
                # TODO what to do if metric has no value?
                if qdata[k] is not None and qdata[k] != 'None':
                    value = float(qdata[k])
                    # Share of datasets for which the metric was counted.
                    perc = int(qdata[k + 'N']) / (datasets *
                                                  1.0) if datasets > 0 else 0
                    c = {
                        'Metric': k,
                        'Dimension': inD['dimension'],
                        'dim_color': inD['color'],
                        'value': value,
                        'perc': perc
                    }
                    c.update(v)
                    d.append(c)
        df = pd.DataFrame(d)
        with Timer(key="dataDF", verbose=True):
            p = qualityChart(df)

        script, div = components(p)

        js_resources = INLINE.render_js()
        css_resources = INLINE.render_css()

        data = getPortalInfos(Session, portalid, snapshot)
        data['portals'] = [row2dict(r) for r in Session.query(Portal).all()]
        data['quality'] = qdata
        return render("odpw_portal_quality.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources,
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data,
                      qa=qa)
def _portalSnapshotQuery(Session, portalid, snapshot, dynamicityEnabled):
    """Build the query joining a portal with its rows for one snapshot.

    Result rows are ``(Portal, PortalSnapshot, PortalSnapshotQuality)`` plus
    a trailing ``PortalSnapshotDynamicity`` when ``dynamicityEnabled`` is
    true.
    """
    q = Session.query(Portal).filter(Portal.id == portalid) \
        .join(PortalSnapshotQuality, PortalSnapshotQuality.portalid == Portal.id) \
        .filter(PortalSnapshotQuality.snapshot == snapshot) \
        .join(PortalSnapshot, PortalSnapshot.portalid == Portal.id) \
        .filter(PortalSnapshot.snapshot == snapshot) \
        .add_entity(PortalSnapshot) \
        .add_entity(PortalSnapshotQuality)

    if dynamicityEnabled:
        q = q.join(PortalSnapshotDynamicity, PortalSnapshotDynamicity.portalid == Portal.id) \
            .filter(PortalSnapshotDynamicity.snapshot == snapshot) \
            .add_entity(PortalSnapshotDynamicity)
    return q


def portal(portalid, snapshot=None):
    """Render the overview page of a portal.

    If the requested snapshot has no complete data, the function walks back
    week by week until a snapshot with data is found.  The template still
    receives the originally requested snapshot.

    Args:
        portalid: portal identifier.
        snapshot: snapshot identifier; ``None`` (the default) means the
            snapshot of the current time, resolved on every call rather than
            once at import time.

    Returns:
        The value returned by ``render`` for ``odpw_portal.jinja``.

    Raises:
        LookupError: if no snapshot at or before the requested one has data
            for the portal (this used to loop forever).
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())

    with Timer(key="get_portal", verbose=True):
        current_sn = snapshot
        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        dynamicityEnabled = current_app.config.get('dynamicity', False)

        with Timer(key="query_portal", verbose=True):
            # The query needs a PortalSnapshot row, so nothing can match
            # below the earliest snapshot known for this portal.
            # NOTE(review): assumes snapshot identifiers order
            # chronologically under ``<`` - confirm.
            known = data['snapshots']['list']
            earliest = min(known) if known else None

            r = _portalSnapshotQuery(Session, portalid, snapshot,
                                     dynamicityEnabled).first()
            while r is None:
                snapshot = getPreviousWeek(snapshot)
                if earliest is None or snapshot < earliest:
                    raise LookupError(
                        "no snapshot data available for portal {}".format(
                            portalid))
                r = _portalSnapshotQuery(Session, portalid, snapshot,
                                         dynamicityEnabled).first()

            data['portal'] = row2dict(r[0])
            data['fetchInfo'] = row2dict(r[1])
            data['fetchInfo']['duration'] = data['fetchInfo']['end'] - data[
                'fetchInfo']['start']

            if dynamicityEnabled:
                data['dynamicity'] = row2dict(r[3])
            data['quality'] = row2dict(r[2])

        return render("odpw_portal.jinja",
                      snapshot=current_sn,
                      portalid=portalid,
                      data=data)
Exemple #13
0
    def checkUpdateRobots(self, uri):
        """Ensure robots rules for the host of ``uri`` are cached, then query them.

        While another thread has the host marked as in progress this call
        polls every five seconds.  If the host is not in the robots cache,
        its robots.txt is fetched and parsed; on any error an empty rule set
        with status 499 is stored instead.  Rules are stored with an expiry
        of ``ttl`` seconds from now.

        Args:
            uri: URI whose host's robots rules should be checked.

        Returns:
            The result of ``self.robots.robots.allowed(uri, user_agent)``.
        """
        ttl = 36000
        with Timer(key="checkUpdateRobots"):
            log.info("Robots.txt", url=uri, thread=current_thread())

            canonical = hostname(uri)
            robots_url = roboturl(uri)

            while canonical in self.robots.robotsInProgress:
                #seems one thread is doing the parsing already
                sleep(5)

            if canonical not in self.robots.robots._cache:
                self.robots.robotsInProgress.add(canonical)
                try:
                    with Timer(key="robots.fetch_parse"):
                        try:
                            # First things first, fetch the thing
                            log.info('HTTP_GET', uri=robots_url)
                            req = self.rsession.get(robots_url,
                                                    timeout=10,
                                                    allow_redirects=True,
                                                    headers=headers)

                            # And now parse the thing and update
                            import reppy.parser
                            r = reppy.parser.Rules(robots_url,
                                                   req.status_code,
                                                   req.content,
                                                   time.time() + ttl)
                            self.robots.robots.add(r)
                            delay = self.robots.robots.delay(uri, user_agent)
                            self.queue.addWait(uri, delay)
                        except Exception as e:
                            with Timer(key="robots.allowed.error"):
                                ErrorHandler.handleError(
                                    log,
                                    "Robots.txt Exception",
                                    exception=e,
                                    url=uri,
                                    thread=current_thread())
                                try:
                                    # Fall back to an empty rule set so the
                                    # host is not fetched again immediately.
                                    import reppy.parser
                                    r = reppy.parser.Rules(
                                        robots_url, 499, "",
                                        time.time() + ttl)
                                    self.robots.robots.add(r)
                                except Exception as fallback_error:
                                    ErrorHandler.handleError(
                                        log,
                                        "Robots.txt ExException",
                                        exception=fallback_error,
                                        url=uri,
                                        thread=current_thread())
                finally:
                    # Always clear the in-progress marker; otherwise an error
                    # raised by the handlers above would leave every other
                    # thread for this host waiting forever.
                    self.robots.robotsInProgress.remove(canonical)
            return self.robots.robots.allowed(uri, user_agent)
Exemple #14
0
def aggregateByPortal1(db, portalid, snapshot):
    """Collect the quality attributes of all datasets of a portal snapshot.

    For every matching ``Dataset`` the attributes of ``d.data.quality`` are
    read through ``inspect(...).attrs`` and appended column-wise.

    Args:
        db: object exposing a ``Session`` attribute used for the query.
        portalid: portal identifier.
        snapshot: snapshot identifier.

    Returns:
        pandas.DataFrame: one column per attribute key, one row per dataset.
    """
    with Timer(key="qualityDF1", verbose=True):
        result = defaultdict(list)
        q = db.Session.query(Dataset).filter(
            Dataset.snapshot == snapshot).filter(Dataset.portalid == portalid)
        # Debug output of the query; written as calls so the module also
        # parses on Python 3 (output is identical on Python 2).
        print(str(q))
        print('-' * 50)
        for d in q:
            with Timer(key="inspect1"):
                instance = inspect(d.data.quality)
            for key, x in instance.attrs.items():
                result[key].append(x.value)
        return pd.DataFrame(result)
def getPortalInfos(Session, portalid, snapshot):
    """Return the snapshot navigation data of a portal.

    Args:
        Session: database session.
        portalid: portal identifier.
        snapshot: snapshot to navigate from.

    Returns:
        dict: ``{'snapshots': {'list': [...], 'prev': p, 'next': n}}`` where
        ``list`` holds every snapshot stored for the portal and ``prev`` /
        ``next`` are the neighbouring weeks, or ``None`` when that week is
        not in the list.
    """
    with Timer(key="get_getPortalInfos", verbose=True):
        with Timer(key="query_getPortalInfos", verbose=True):
            rows = Session.query(PortalSnapshot.snapshot).filter(
                PortalSnapshot.portalid == portalid).all()
            snapshots = [row[0] for row in rows]

        prev_week = getPreviousWeek(snapshot)
        if prev_week not in snapshots:
            prev_week = None

        next_week = getNextWeek(snapshot)
        if next_week not in snapshots:
            next_week = None

        return {
            'snapshots': {
                'list': snapshots,
                'prev': prev_week,
                'next': next_week
            }
        }
def orga_resource(portalid, snapshot, orga):
    """Render the resource list of one organisation within a portal snapshot.

    Args:
        portalid: portal identifier.
        snapshot: snapshot identifier.
        orga: organisation whose resources are listed.

    Returns:
        The value returned by ``render`` for
        ``odpw_portal_resources_list.jinja``.
    """
    with Timer(key="get_orga_resource", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']

        data = getResourceInfo(Session, dbc, portalid, snapshot, orga)
        resource_rows = getResourceInfos(Session, snapshot, portalid, orga)

        # Each row is indexed as (resource, dataset id, dataset title).
        entries = []
        for row in resource_rows:
            dataset_link = getDatasetURI(row[1], portalid)
            entries.append({
                'uri': row2dict(row[0]),
                'dataset': {
                    'uri': dataset_link,
                    'title': row[2]
                }
            })
        data['resList'] = entries

        data.update(getPortalInfos(Session, portalid, snapshot))
        count_query = current_app.config['dbsession'].query(
            Portal.resourcecount).filter(Portal.id == portalid)
        for count_row in count_query:
            data['resources'] = count_row[0]

        return render("odpw_portal_resources_list.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
Exemple #17
0
def systemEvolutionBarPlot(df, yLabel, values):
    """Build a stacked bar chart of ``values`` per snapshot, stacked by software.

    Args:
        df: data passed to ``Bar``; must provide the ``snapshot`` and
            ``software`` columns plus the column named by ``values``.
        yLabel: label of the y axis.
        values: name of the column that is summed per bar.

    Returns:
        The configured chart object created by ``Bar``.
    """
    with Timer(key='systemEvolutionBarPlot', verbose=True):
        p = Bar(df,
                label='snapshot',
                values=values,
                agg='sum',
                stack='software',
                legend='bottom_left',
                bar_width=0.5,
                xlabel="Snapshots",
                ylabel=yLabel,
                responsive=True,
                height=200,
                tools='hover')

        # The glyph renderer data sources were collected here before but
        # never used; that dead code has been removed.
        hover = p.select(HoverTool)
        hover.tooltips = [
            ('software', ' @software'),
            ('value', '@height'),
        ]
        p.xaxis.formatter = FuncTickFormatter.from_py_func(getWeekStringTick)
        p.axis.minor_tick_line_color = None

        p.background_fill_color = "#fafafa"
        p.legend.location = "top_left"
        p.toolbar.logo = None
        p.toolbar_location = None

        # Move the legend entries out of the plot area into a separate
        # legend placed to the right of the chart.
        legend = p.legend[0].legends
        p.legend[0].legends = []
        l = Legend(location=(0, -30))
        l.items = legend
        p.add_layout(l, 'right')

        return p
def portalLicenses(snapshot, portalid):
    """Render the distribution page of a portal for one snapshot.

    The template data consists of the snapshot navigation info, the list of
    all portals and the unlimited result of ``aggregatePortalInfo``.

    Args:
        snapshot: snapshot identifier.
        portalid: portal identifier.

    Returns:
        The value returned by ``render`` for ``odpw_portal_dist.jinja``.
    """
    with Timer(key="get_portalLicenseDist", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        data = getPortalInfos(Session, portalid, snapshot)

        with Timer(key="query_portalLicenseDist", verbose=True):
            all_portals = Session.query(Portal).all()
            data['portals'] = [row2dict(entry) for entry in all_portals]

        distributions = aggregatePortalInfo(Session,
                                            portalid,
                                            snapshot,
                                            dbc,
                                            limit=None)
        data.update(distributions)

        return render("odpw_portal_dist.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid)
def portalreport(portalid, snapshot=None):
    """Render the report page listing the organisations of a portal.

    Args:
        portalid: portal identifier.
        snapshot: snapshot identifier; ``None`` (the default) means the
            snapshot of the current time.  The default is resolved on every
            call because a default evaluated in the signature is computed
            only once, when the module is imported.

    Returns:
        The value returned by ``render`` for ``odpw_portal_report.jinja``.
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())

    with Timer(key="get_portal", verbose=True):

        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalreport", verbose=True):
            q = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot).distinct(Dataset.organisation)

            data['organisations'] = [row2dict(res) for res in q]
        return render("odpw_portal_report.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data)
def systemchanges():
    """Render the changes between the current and the previous snapshot.

    Portals present in both snapshots are compared: a differing status is
    reported under ``status_change``; with equal status a differing dataset
    count goes to ``ds_change``, otherwise a differing resource count goes
    to ``res_change``.  ``None`` counts are reported as 0.

    Returns:
        The value returned by ``render`` for ``odpw_system_changes.jinja``.
    """
    with Timer(key="get_systemchanges"):

        Session = current_app.config['dbsession']
        cursn = getSnapshotfromTime(datetime.datetime.now())
        prevWeek = getPreviousWeek(cursn)

        with Timer(key="query_systemchanges"):
            data_cur = {}
            for row in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == cursn):
                data_cur[row.portalid] = row

            data_prev = {}
            for row in Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == prevWeek):
                data_prev[row.portalid] = row

        def or_zero(value):
            # Missing counts are reported as 0.
            return value if value is not None else 0

        status_change = {}
        ds_change = {}
        res_change = {}
        for pid, cur in data_cur.items():
            if pid not in data_prev:
                continue
            prev = data_prev[pid]

            same_status = cur.status == prev.status
            if not same_status:
                status_change[pid] = {'from': prev.status, 'to': cur.status}
                continue

            if cur.datasetcount != prev.datasetcount:
                ds_change[pid] = {
                    'from': or_zero(prev.datasetcount),
                    'to': or_zero(cur.datasetcount)
                }
            elif cur.resourcecount != prev.resourcecount:
                res_change[pid] = {
                    'from': or_zero(prev.resourcecount),
                    'to': or_zero(cur.resourcecount)
                }

        data = {
            'status_change': status_change,
            'ds_change': ds_change,
            'res_change': res_change
        }
        data['from'] = prevWeek
        data['to'] = cursn

        return render("odpw_system_changes.jinja", data=data)
Exemple #21
0
def cli(args, dbm):
    """Run the HEAD lookup pipeline and exit the process.

    Sets up a queue filler, ``args.threads`` worker threads and an inserter
    around a shared ``DomainQueue`` and result queue, waits for the filler
    and the inserter to finish, prints the timer statistics and exits with
    status 0.

    Args:
        args: parsed arguments providing ``batch``, ``threads`` and ``delay``.
        dbm: passed to ``DBClient``.
    """
    sn = getCurrentSnapshot()
    db = DBClient(dbm)

    batch = args.batch
    concurrent = args.threads
    delay = args.delay

    log.info("START HEAD", batch=batch, delay=delay, threads=concurrent)

    rsession = requests.Session()
    robots = RobotsManager(rsession)

    q = DomainQueue(args.delay)

    filler = QueueFiller(db, q, robots, batch * 2, sn, concurrent)
    filler.daemon = True
    # Pre-fill the queue once before the workers and the filler start.
    filler.filling_queue(batch=batch)

    resultQueue = Queue(maxsize=0)
    #start worker threads
    for i in range(concurrent):
        t = Worker(q=q,
                   resultQueue=resultQueue,
                   robots=robots,
                   rsession=rsession,
                   sn=sn)
        t.daemon = True
        t.start()

    filler.start()

    # Floor division keeps the inserter batch size an integer under true
    # division as well (Python 3 or ``from __future__ import division``).
    inserter = Inserter(db=db,
                        resultQueue=resultQueue,
                        domainQueue=q,
                        batch=batch // 2)
    inserter.start()

    filler.join()
    inserter.join()
    Timer.printStats()

    import sys
    sys.exit(0)
def systemfetch():
    """Render the fetch-process chart for the most recent snapshots.

    For every portal snapshot newer than the last of the snapshots returned
    by ``getLastNSnapshots(cursn, n=5)`` the fetch duration is read.  Per
    snapshot, each finished fetch becomes the number of seconds between the
    earliest fetch start of that snapshot and the end of the fetch.

    Returns:
        The value returned by ``render`` for ``odpw_system_fetch.jinja``.
    """
    with Timer(key="get_systemfetch"):
        Session = current_app.config['dbsession']

        cursn = getSnapshotfromTime(datetime.datetime.now())
        snapshots = getLastNSnapshots(cursn, n=5)
        nWeeksago = snapshots[-1]

        cnts = defaultdict(int)
        data = {}
        with Timer(key="query_systemfetch"):
            for r in Session.query(PortalSnapshot.snapshot,
                                   PortalSnapshot.start, PortalSnapshot.end -
                                   PortalSnapshot.start).filter(
                                       PortalSnapshot.snapshot > nWeeksago):
                sn, start, dur = r[0], r[1], r[2]
                cnts[sn] += 1

                # The snapshot gets an entry even if no fetch has a duration.
                d = data.setdefault(sn, {})
                if dur is not None:
                    d.setdefault(start, []).append(dur.total_seconds())

        finish_offsets = {}
        for sn, by_start in data.items():
            if not by_start:
                # No fetch of this snapshot has a duration; min() on the
                # empty mapping used to raise ValueError here.
                finish_offsets[sn] = []
                continue
            gstart = min(by_start)
            finish_offsets[sn] = [
                (start - gstart).total_seconds() + dur
                for start, durations in by_start.items()
                for dur in durations
            ]

        with Timer(key="plot_systemfetch"):
            p = fetchProcessChart(finish_offsets, cnts)
            script, div = components(p)

            js_resources = INLINE.render_js()
            css_resources = INLINE.render_css()

        return render("odpw_system_fetch.jinja",
                      plot_script=script,
                      plot_div=div,
                      js_resources=js_resources,
                      css_resources=css_resources)
def portalOrgareport(portalid, orga, snapshot=None):
    """Render the contact report of one organisation within a portal.

    Args:
        portalid: portal identifier.
        orga: organisation whose contacts are listed.
        snapshot: snapshot identifier; ``None`` (the default) means the
            snapshot of the current time.  The default is resolved on every
            call because a default evaluated in the signature is computed
            only once, when the module is imported.

    Returns:
        The value returned by ``render`` for
        ``odpw_portal_report_contacts.jinja``.
    """
    if snapshot is None:
        snapshot = getSnapshotfromTime(datetime.datetime.now())

    with Timer(key="get_portal", verbose=True):

        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)

        with Timer(key="query_portalreport", verbose=True):
            portal = Session.query(Portal).filter(
                Portal.id == portalid).first()
            data['contacts'] = contactPerOrga(Session, portal, snapshot, orga)

        return render("odpw_portal_report_contacts.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      data=data,
                      organisation=orga)
Exemple #24
0
def createDatasetQuality(P, md5v, dataset):
    """Run every analyser from ``dcat_analyser()`` on ``dataset``.

    Args:
        P: not used by this function.
        md5v: value stored as ``md5`` on the result.
        dataset: object handed to each analyser's ``analyse_Dataset``.

    Returns:
        DatasetQuality: built with one keyword per analyser, named after the
        analyser's lower-cased ``id``.
    """
    with Timer(key='quality'):
        metrics = {
            analyser.id.lower(): analyser.analyse_Dataset(dataset)
            for _, analyser in dcat_analyser().items()
        }
        return DatasetQuality(md5=md5v, **metrics)
Exemple #25
0
    def get(self, portalid):
        """Return the snapshots stored for ``portalid`` as a JSON response."""
        with Timer(key="PortalSnapshots.get",verbose=True):
            session = current_app.config['dbsession']

            snapshot_query = session.query(PortalSnapshot.snapshot).filter(
                PortalSnapshot.portalid == portalid)
            payload = []
            for row in snapshot_query.all():
                payload.append(row2dict(row))

            return jsonify(payload)
def resourceInfo(snapshot, portalid, uri):
    """Render the detail page of a single resource URI.

    Loads all resource rows of the portal snapshot (wrapped by ``withView``)
    and every ``ResourceInfo`` row for ``uri``.  A non-empty ``header`` value
    is parsed with ``ast.literal_eval``; a ``None`` header becomes ``""``.

    Args:
        snapshot: snapshot identifier.
        portalid: portal identifier.
        uri: resource URI to show.

    Returns:
        The value returned by ``render`` for ``odpw_portal_resource.jinja``.
    """
    with Timer(key="get_resourceInfo", verbose=True):
        Session = current_app.config['dbsession']
        dbc = current_app.config['dbc']
        data = getPortalInfos(Session, portalid, snapshot)

        with Timer(key="query_resources", verbose=True):
            viewName = "view_{}_{}_{}".format('resinfo', portalid, snapshot)
            qorg = getResourceInfos(Session, snapshot, portalid)
            q = withView(qorg, viewName, Session, dbc)
            start = time.time()
            data['resources'] = [row2dict(r) for r in q.all()]
            end = time.time()
            # A read slower than five seconds triggers creation of the view.
            if (end - start) > 5:
                print("Create View {}".format(viewName))
                try:
                    createView(qorg, viewName, Session)
                except Exception as e:
                    # An already existing view is fine.  str(e) is used
                    # because ``e.message`` does not exist on Python 3; the
                    # bare raise keeps the original traceback.
                    if 'already exists' not in str(e):
                        raise

        with Timer(key="query_resourceInfo", verbose=True):
            q = Session.query(ResourceInfo) \
                .filter(ResourceInfo.uri == uri)
            data['resourceInfo'] = [row2dict(r) for r in q.all()]

            for r in data['resourceInfo']:
                if 'header' in r:
                    if r['header'] is None:
                        r['header'] = ""
                    else:
                        r['header'] = ast.literal_eval(str(r['header']))

        return render("odpw_portal_resource.jinja",
                      snapshot=snapshot,
                      portalid=portalid,
                      uri=uri,
                      data=data)
Exemple #27
0
def createMetaResources(md5v, dataset):
    """Build one ``MetaResource`` per distinct distribution URL of ``dataset``.

    The access and download URLs of the dataset are normalised, enriched
    with format, media type, size and creation/modification dates, escaped,
    and de-duplicated on the final (escaped) URI.

    :param md5v: md5 value stored on every created ``MetaResource`` and
        attached to log records.
    :param dataset: dataset object handed to the ``getDistribution*``
        helpers; its ``id`` is logged for duplicate URIs.
    :return: list of ``MetaResource`` objects, in first-seen order.
    """
    with Timer(key='createMetaResources'):
        res = (getDistributionAccessURLs(dataset) +
               getDistributionDownloadURLs(dataset))
        bulk_mr = []
        # Kept as a list (not a set): when normalisation fails the raw
        # value is retained, and it is not guaranteed to be hashable.
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                # ``message`` only exists on Python 2 exceptions; fall back
                # to str(e) so logging cannot raise inside the handler.
                if hasattr(e, 'message'):
                    msg = e.message
                else:
                    msg = str(e)
                log.debug("URIFormat", uri=uri, md5=md5v, msg=msg)
                # Keep the un-normalised URI but flag the resource invalid.
                valid = False

            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)

            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # On failure the URI is used as it was before escaping.
                # Fixed: the original passed ``md5`` instead of ``md5v``.
                ErrorHandler.handleError(log,
                                         "safe_url_string",
                                         exception=exc,
                                         md5=md5v,
                                         uri=uri,
                                         exc_info=True)

            if uri in uris:
                log.debug("WARNING, duplicate URI",
                          dataset=dataset.id,
                          md5=md5v,
                          uri=uri,
                          format=f,
                          media=m)
                continue

            # Size is best effort: anything not convertible becomes None.
            try:
                s = int(float(s)) if s is not None else None
            except Exception:
                s = None

            MR = MetaResource(uri=uri,
                              md5=md5v,
                              media=m,
                              valid=valid,
                              format=normaliseFormat(f),
                              size=s,
                              created=toDatetime(c),
                              modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
        return bulk_mr
def getPortalDatasets(Session, portalid, snapshot):
    """Return title and id of every dataset of a portal in one snapshot.

    :param Session: database session used to issue the query.
    :param portalid: value matched against ``Dataset.portalid``.
    :param snapshot: value matched against ``Dataset.snapshot``.
    :return: ``{"datasets": [...]}`` with one ``row2dict`` result per row.
    """
    with Timer(key="getPortalDatasets", verbose=True):
        query = Session.query(Dataset.title, Dataset.id)
        query = query.filter(Dataset.portalid == portalid)
        query = query.filter(Dataset.snapshot == snapshot)

        datasets = []
        for row in query.all():
            datasets.append(row2dict(row))
        return {"datasets": datasets}
def _mean_and_count_by(df, column):
    """Return per-group means of ``df`` grouped by ``column`` plus a ``count`` column, as records."""
    grouped = df.groupby([column])
    summary = grouped.agg('mean').join(
        pd.DataFrame(grouped.size(), columns=['count']))
    return summary.reset_index().to_dict(orient='records')


def portalsquality():
    """Render the quality overview of all portals for the current snapshot.

    The snapshot is derived from the current time. Each portal row is
    joined with its ``PortalSnapshotQuality`` entry; the quality metric
    columns (lower-cased metric names taken from ``qa``) are coerced to
    numbers and averaged per ``iso`` and per ``software`` value.

    :return: whatever ``render`` returns for ``odpw_portals_quality.jinja``.
    """
    with Timer(key="get_portalsquality", verbose=True):

        Session = current_app.config['dbsession']
        snapshot = getSnapshotfromTime(datetime.datetime.now())

        with Timer(key="query_portalsquality"):
            query = Session.query(Portal, Portal.datasetcount,
                                  Portal.resourcecount) \
                .join(PortalSnapshotQuality) \
                .filter(PortalSnapshotQuality.snapshot == snapshot) \
                .add_entity(PortalSnapshotQuality)
            results = [row2dict(r) for r in query]

        keys = [i.lower() for q in qa for i in q['metrics']]

        if results:
            df = pd.DataFrame(results)

            # NOTE(review): convert_objects is deprecated in newer pandas
            # releases (pd.to_numeric is the documented replacement);
            # confirm the pinned pandas version before migrating.
            for c in keys:
                df[c] = df[c].convert_objects(convert_numeric=True)

            resultsIso = _mean_and_count_by(df, 'iso')
            resultSoft = _mean_and_count_by(df, 'software')
        else:
            # No rows for this snapshot: an empty DataFrame has no columns,
            # so the column access and groupby above would raise KeyError.
            resultsIso = []
            resultSoft = []

        return render('odpw_portals_quality.jinja',
                      data={
                          'portals': results,
                          'iso': resultsIso,
                          'soft': resultSoft
                      },
                      keys=keys,
                      snapshot=snapshot)
Exemple #30
0
    def run(self):
        """Worker loop: process URIs from ``self.queue`` until ``None`` arrives.

        For every URI with a scheme starting with ``http`` the robots check
        is consulted; when it allows the URI its status is fetched,
        otherwise the denial is logged. In both cases the outcome is
        passed to ``handle_request`` (with a ``None`` result when denied).
        Any other scheme, or any error while processing, is reported and
        passed to ``handle_request`` as the exception argument.
        """
        log.info("Started", thread=current_thread())
        while True:
            with Timer(key="doWork"):
                try:
                    with Timer(key="q.get"):
                        uri = self.queue.get()
                    # A None item ends the loop.
                    if uri is None:
                        break
                    try:
                        # Parsing doubles as a validity check; a failure
                        # here is handled like any other processing error.
                        parsed = urlparse(uri)
                        if not parsed.scheme.startswith('http'):
                            raise InvalidSchema(
                                "No connection adapters were found for " + uri)

                        if self.checkUpdateRobots(uri):
                            status = self.getStatus(uri)
                        else:
                            status = None
                            log.info("ROBOTS DENIED",
                                     uri=uri,
                                     thread=current_thread())
                        self.handle_request(uri, status, None)
                    except Exception as err:
                        ErrorHandler.handleError(log,
                                                 exception=err,
                                                 msg="doWork",
                                                 exc_info=True,
                                                 url=uri,
                                                 thread=current_thread())
                        self.handle_request(uri, None, err)

                except Exception as err:
                    # Last line of defence so the loop keeps running.
                    ErrorHandler.handleError(log,
                                             exception=err,
                                             msg="uncaught in doWork",
                                             exc_info=True,
                                             thread=current_thread())

        log.info("STOPPED", thread=current_thread())