Beispiel #1
0
def change_history(obj):
    """Compute freshness scores for every resource of one portal snapshot.

    For each resource returned by ``db.getMetaResource`` the header-based and
    metadata-based change histories are scored with ``freshness_score`` and
    the combined scores are handed to ``db.add`` as a ``ResourceFreshness``.

    Args:
        obj: Indexable work item ``(portal, dbConf, snapshot)``. ``portal``
            must expose ``id``; ``dbConf`` holds the keyword arguments for
            ``DBManager``.

    Returns:
        The tuple ``(portal, snapshot)``. An exception raised while
        processing the resources ends the loop, is reported through
        ``ErrorHandler`` and is not re-raised.
    """
    P, dbConf, snapshot = obj[0], obj[1], obj[2]
    log.info("ChangeHistory", portalid=P.id, snapshot=snapshot)

    dbm = DBManager(**dbConf)
    db = DBClient(dbm)

    try:
        for res in db.getMetaResource(snapshot, portalid=P.id):
            # metadata modification date
            meta_lm = res.modified
            if meta_lm:
                # NOTE(review): the history entry is built but not stored;
                # the db.add call for it is disabled.
                reshist = ResourceHistory(uri=res.uri, md5=res.md5,
                                          snapshot=snapshot, modified=meta_lm,
                                          source='metadata')

            if db.exist_resourceinfo(uri=res.uri, snapshot=snapshot):
                resInfo = db.getResourceInfoByURI(uri=res.uri,
                                                  snapshot=snapshot).one()
                # Fall back to an empty mapping so that a record without a
                # header (assumption: e.g. a failed request - confirm) does
                # not raise on the membership tests and abort the whole run.
                header = resInfo.header or {}

                raw_lm = None
                if 'last-modified' in header:
                    raw_lm = header['last-modified'][0]
                elif 'Last-Modified' in header:
                    raw_lm = header['Last-Modified'][0]

                header_lm = None
                if raw_lm:
                    try:
                        header_lm = dateutil.parser.parse(raw_lm)
                    except Exception:
                        # An unparsable date is treated like a missing one.
                        header_lm = None
                if header_lm:
                    # NOTE(review): built but not stored, as above.
                    reshist = ResourceHistory(uri=res.uri, md5=res.md5,
                                              snapshot=snapshot,
                                              modified=header_lm,
                                              source='header')

                # TODO comparison based: ETag

            # compute freshness scores for header and metadata; each score
            # name gets the source appended ("<name>_header", ...).
            fresh_scores = {}
            for source in ('header', 'metadata'):
                changes = list(db.getResourcesHistory(uri=res.uri,
                                                      md5=res.md5,
                                                      source=source))
                scores = freshness_score(changes, snapshot)
                if scores:
                    for name in scores:
                        fresh_scores[name + '_' + source] = scores[name]
            db.add(ResourceFreshness(uri=res.uri, md5=res.md5,
                                     snapshot=snapshot, **fresh_scores))
    except Exception as exc:
        ErrorHandler.handleError(log, "FreshnessException", exception=exc,
                                 pid=P.id, snapshot=snapshot, exc_info=True)

    return (P, snapshot)
Beispiel #2
0
    def process_response(self, request, response, spider):
        """Record digest/size metadata for a response and save its body.

        robots.txt responses are only counted in ``self.stats``. For any
        other response with status 200 the body is hashed, the digest is
        compared with ``spider.api.getLastDigest`` and the body is written
        to the path given in ``request.meta['git']``. Results are published
        through ``request.meta``:

        * ``contentchanged``: -1 if not evaluated, 0 if the digest equals
          the last known one, 1 otherwise.
        * ``error``: ``None`` or the string form of a file error.
        * ``domain``: network location of the URL (set only if absent).
        * ``digest``: MD5 hex digest of the body bytes.
        * ``size``: length of the body in bytes.

        Returns:
            A ``Response`` built from URL, status and headers only (no
            body). If the target file cannot be prepared or opened, the
            returned response has status 606.
        """
        request.meta['contentchanged'] = -1
        is_robots = 'robots.txt' in response.url

        if is_robots:
            self.stats.inc_value('robots')
            self.stats.inc_value('robot_status/' + str(response.status))

        if response.status == 200 and not is_robots:
            if hasattr(response, 'body_as_unicode'):
                content = response.body_as_unicode()
            else:
                content = response.body
            # Hashing and binary file writes need bytes. Encode text bodies
            # explicitly rather than relying on an implicit ASCII
            # conversion, which fails for non-ASCII content.
            if not isinstance(content, bytes):
                content = content.encode('utf-8')

            request.meta['error'] = None

            if 'domain' not in request.meta:
                try:
                    parsed_uri = urlparse(response.url)
                    domain = '{uri.netloc}'.format(uri=parsed_uri)
                except Exception:
                    domain = 'error'
                request.meta['domain'] = domain

            # compute digest and file size (byte length of the stored body,
            # not the in-memory size of the Python object)
            digest = hashlib.md5(content).hexdigest()
            request.meta['digest'] = digest
            request.meta['size'] = len(content)

            # compare against the last digest known for this URI
            last_digest = spider.api.getLastDigest(uri=response.url)
            unchanged = last_digest and last_digest[0] == digest
            request.meta['contentchanged'] = 0 if unchanged else 1

            try:
                filename = request.meta['git']
                directory = os.path.dirname(filename)
                if not os.path.exists(directory):
                    os.makedirs(directory)

                with open(filename, 'wb') as fw:
                    try:
                        fw.write(content)
                    except Exception as e:
                        # A failed write is recorded but the normal response
                        # is still returned below.
                        request.meta['error'] = getExceptionString(e)
                        ErrorHandler.handleError(log,
                                                 "Writing file",
                                                 exception=e)
            except Exception as e:
                ErrorHandler.handleError(log,
                                         'file_download',
                                         exception=e,
                                         uri=request.url)
                request.meta['error'] = getExceptionString(e)
                return Response(url=request.url,
                                status=606,
                                headers=response.headers,
                                request=request)

        return Response(url=response.url,
                        status=response.status,
                        headers=response.headers,
                        request=request)
Beispiel #3
0
def fetchHttp(obj):
    """Fetch the datasets of a portal for one snapshot and record the result.

    Phases, each with its own error reporting through ``ErrorHandler``:
    fetch and insert the datasets, count datasets/resources, update the
    ``PortalSnapshot`` row (counts, end time, status, error string),
    aggregate portal quality, and compute dynamicity statistics.

    Args:
        obj: Indexable work item ``(portal, dbConf, snapshot, store_local)``.
            ``portal`` must expose ``id``; ``dbConf`` holds the keyword
            arguments for ``DBManager``.

    Returns:
        The tuple ``(portal, snapshot)``. Errors inside the phases listed
        above are reported and not re-raised.
    """
    P, dbConf, snapshot, store_local = obj[0], obj[1], obj[2], obj[3]
    log.info("HTTPInsert", portalid=P.id, snapshot=snapshot)

    dbm = DBManager(**dbConf)
    db = DBClient(dbm)

    with Timer(key='InsertPortal', verbose=True):
        PS = PortalSnapshot(portalid=P.id, snapshot=snapshot)
        PS.start = datetime.datetime.now()
        # NOTE(review): random 1-10 s delay before the insert; presumably
        # meant to stagger parallel workers - confirm.
        sleep(randint(1, 10))
        db.add(PS)

        # The fetch outcome is kept in dedicated names so the later
        # exception handlers cannot overwrite or unbind it.
        try:
            processor = getPortalProcessor(P)
            dataset_iter = processor.generateFetchDatasetIter(P, PS, snapshot)
            insertDatasets(P, db, dataset_iter, snapshot,
                           store_local=store_local)
            status = 200
            fetch_error = None
            db.commit()
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "PortalFetchException",
                                     exception=e,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)
            status = getExceptionCode(e)
            fetch_error = getExceptionString(e)

        # Pre-initialise the counts so that a failing count query does not
        # prevent end time and status from being written below.
        ds_count = ds_fetched = res_count = None
        try:
            ds_count = PS.datasetcount
            snapshot_datasets = db.Session.query(Dataset).filter(
                Dataset.snapshot == snapshot).filter(
                    Dataset.portalid == P.id)
            ds_fetched = snapshot_datasets.count()
            res_count = snapshot_datasets.join(
                MetaResource, MetaResource.md5 == Dataset.md5).count()
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "PortalSnapshotUpdate",
                                     exception=e,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)

        # update the portalsnapshot object with counts, end time and outcome
        try:
            s = db.Session
            stored_ps = s.query(PortalSnapshot).filter(
                PortalSnapshot.portalid == P.id,
                PortalSnapshot.snapshot == snapshot).first()
            stored_ps.datasetsfetched = ds_fetched
            stored_ps.resourcecount = res_count
            stored_ps.datasetcount = ds_count
            stored_ps.end = datetime.datetime.now()
            stored_ps.exc = fetch_error
            stored_ps.status = status
            s.commit()
            s.remove()
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "PortalSnapshotUpdate",
                                     exception=e,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)

        try:
            aggregatePortalQuality(db, P.id, snapshot)
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "PortalFetchAggregate",
                                     exception=e,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)

        # compute dynamicity stats against the preceding snapshot
        try:
            snapshots = sorted(
                ps.snapshot for ps in db.Session.query(PortalSnapshot).filter(
                    PortalSnapshot.portalid == P.id))
            position = snapshots.index(snapshot)
            # NOTE(review): for the earliest snapshot position - 1 is -1,
            # which selects the LAST list entry - confirm this is intended.
            dynamicity.dynPortal(db, P, snapshot, snapshots[position - 1])
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "PortalDynamicity",
                                     exception=e,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)

    return (P, snapshot)
Beispiel #4
0
    def handle_request(self, uri, response, error):
        """Turn the outcome of one request into a ``ResourceInfo`` and queue it.

        Args:
            uri: The requested URI.
            response: Object exposing ``headers`` and ``status_code``, or
                ``None`` if no response is available.
            error: The exception raised by the request, or ``None``.

        The record is put on ``self.resultQueue``. Status is 666 when both
        ``response`` and ``error`` are ``None``, the code derived from
        ``error`` when one is given, and the response status otherwise.
        Any exception raised while building or queueing the record is
        reported through ``ErrorHandler`` and not re-raised.
        """
        with Timer(key="handle_request"):
            try:
                r = {
                    'snapshot': self.snapshot,
                    'uri': uri,
                    'timestamp': datetime.datetime.now(),
                    'status': None,
                    'exc': None,
                    'header': None,
                    'mime': None,
                    'size': None
                }

                if response is None and error is None:
                    # robots
                    r['status'] = 666
                elif error is not None:
                    ErrorHandler.handleError(log,
                                             "handle_request",
                                             exception=error,
                                             url=uri,
                                             snapshot=self.snapshot,
                                             exc_info=True)
                    r['status'] = getExceptionCode(error)
                    r['exc'] = getExceptionString(error)
                else:
                    with Timer(key="header_dict"):
                        # Lower-case the header names so the lookups below
                        # do not depend on the server's capitalisation.
                        # items() works on both Python 2 and Python 3.
                        header_dict = dict(
                            (k.lower(), v)
                            for k, v in dict(response.headers).items())

                        if 'content-type' in header_dict:
                            r['mime'] = extractMimeType(
                                header_dict['content-type'])
                        else:
                            r['mime'] = 'missing'

                        r['status'] = response.status_code
                        r['header'] = header_dict

                        if response.status_code == 200:
                            # The size is taken as-is from the header
                            # value; 0 when the header is absent.
                            r['size'] = header_dict.get('content-length', 0)

                RI = ResourceInfo(**r)
                self.resultQueue.put(RI)
                log.info("PROCESSED",
                         uri=uri,
                         status=RI.status,
                         thread=current_thread())
            except Exception as e:
                ErrorHandler.handleError(log,
                                         "Processed Exception",
                                         exception=e,
                                         url=uri,
                                         snapshot=self.snapshot,
                                         exc_info=True,
                                         thread=current_thread())