def test_dates(self):
    """Dates are validated, de-duplicated and collected on the metadata."""
    # A value that is not a parseable date must be dropped.
    meta = Metadata()
    meta.add_date('today')
    assert len(meta.dates) == 0, meta.dates

    # Valid dates accumulate, but exact duplicates are ignored.
    meta = Metadata()
    meta.add_date('2001-01-20')
    assert len(meta.dates) == 1, meta.dates
    meta.add_date('2001-01-20')
    assert len(meta.dates) == 1, meta.dates
    meta.add_date('2002-01-20')
    assert len(meta.dates) == 2, meta.dates
def metadata():
    """Return static application metadata (schemata, facets, i18n tables)."""
    enable_cache(server_side=False)

    schemata = {}
    for schema_id, schema in resolver.store.items():
        # Schema IDs are normalised to always end with the fragment marker.
        if not schema_id.endswith('#'):
            schema_id = schema_id + '#'
        schemata[schema_id] = {
            'id': schema_id,
            'title': schema.get('title'),
            'faIcon': schema.get('faIcon'),
            'plural': schema.get('plural', schema.get('title')),
            'description': schema.get('description'),
            'inline': schema.get('inline', False)
        }

    app_info = {
        'title': get_app_title(),
        'url': get_app_url(),
        'samples': get_config('SAMPLE_SEARCHES')
    }
    return jsonify({
        'status': 'ok',
        'app': app_info,
        'fields': Metadata.facets(),
        'categories': Collection.CATEGORIES,
        'countries': COUNTRY_NAMES,
        'languages': LANGUAGE_NAMES,
        'schemata': schemata
    })
def ingest_url(self, collection_id, metadata, url):
    """Download ``url`` into a temp file and dispatch it for ingestion.

    Celery task body: 404s are skipped silently, other HTTP errors and
    IO failures trigger a task retry; any other exception is recorded
    against the metadata. The temp file is always cleaned up.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            # Missing documents are not an error worth retrying.
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            # NOTE(review): ``3600 ** retries`` grows explosively
            # (3600 ** 2 is ~150 days) and is 1 second on the first
            # retry — was ``3600 * retries`` intended? TODO confirm.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # Stream the response body to disk in 1 KiB chunks.
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            # Record the final (post-redirect) URL if none was given.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # Drop the scoped DB session and remove the temp file even when
        # the archive moved it already (remove_tempfile tolerates that).
        db.session.remove()
        remove_tempfile(tmp_path)
def meta(self):
    """Expose the stored metadata dict as a ``Metadata`` instance.

    Ensures the backing dict exists and mirrors the record's own
    fields into it before wrapping it.
    """
    data = self._meta or {}
    data['content_hash'] = self.content_hash
    data['foreign_id'] = self.foreign_id
    data['crawler'] = self.crawler
    data['crawler_run'] = self.crawler_run
    self._meta = data
    return Metadata.from_data(data)
def ingest_upload(collection_id):
    """Accept user file uploads into the given collection and queue ingest."""
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)

    # Shared metadata for all uploaded files comes as a JSON form field.
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))

    metas = []
    for storage in request.files.values():
        data = meta.copy()
        data['mime_type'] = storage.mimetype
        data['file_name'] = storage.filename
        validate(data, 'metadata.json#')
        file_meta = Metadata.from_data(data)
        file_meta.crawler_id = 'user_upload:%s' % request.auth_role.id
        file_meta.crawler_run = make_textid()
        # Persist the upload under a sanitised name before ingesting.
        sec_fn = os.path.join(get_upload_folder(),
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection.id, file_meta, sec_fn, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        metas.append(file_meta)
    return jsonify({'status': 'ok', 'metadata': metas})
def test_normalize(self):
    """Run the phone number analyzer over every sample number."""
    for sample in PHONE_NUMBERS:
        # German country context helps parse nationally-formatted numbers.
        meta = Metadata.from_data({'countries': ['de']})
        analyzer = PhoneNumberAnalyzer(None, meta)
        analyzer.prepare()
        analyzer.on_text(sample)
        analyzer.finalize()
def ingest_upload(collection_id):
    """Accept user uploads into a collection and queue them for ingest."""
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)

    # Parse the shared metadata form field and stamp crawler identity.
    try:
        meta = json.loads(request.form.get('meta', '{}'))
        meta['crawler_id'] = 'user_upload:%s' % request.authz.role.id
        meta['crawler_run'] = make_textid()
    except Exception as ex:
        raise BadRequest(unicode(ex))

    metas = []
    for storage in request.files.values():
        data = meta.copy()
        data['mime_type'] = storage.mimetype
        data['file_name'] = storage.filename
        data['source_path'] = storage.filename
        validate(data, 'metadata.json#')
        file_meta = Metadata.from_data(data)
        # Save under a sanitised file name before handing off to ingest.
        sec_fn = os.path.join(upload_folder,
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection_id, file_meta, sec_fn, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        metas.append(file_meta)
    return jsonify({'status': 'ok', 'metadata': metas})
def ingest_url(self, collection_id, metadata, url):
    """Download ``url`` into a temp file and dispatch it for ingestion.

    Celery task body. Falls back to the URL itself as foreign_id.
    404s are skipped; other HTTP errors and IO failures retry the task;
    remaining exceptions are recorded against the metadata.
    """
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        # Use the URL as a stable identifier when none was provided.
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            # Missing documents are not worth retrying.
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            # NOTE(review): ``3600**retries`` escalates explosively
            # (1s, 1h, ~150 days) — was ``3600 * retries`` intended?
            # TODO confirm.
            countdown = 3600**self.request.retries
            self.retry(countdown=countdown)
        # Stream the response body to disk in 1 KiB chunks.
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            # Record the final (post-redirect) URL if none was given.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600**self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # Always drop the scoped session and clean the temp file.
        db.session.remove()
        remove_tempfile(tmp_path)
def test_emails(self):
    """Valid emails are stored and their domain indexed; junk is dropped.

    Fixes: the sample address had been redacted to ``'*****@*****.**'``,
    which can never satisfy the ``pudo.org`` domain assertion; the
    assertion messages also referenced the wrong attributes
    (``meta.urls`` and the nonexistent ``meta.domain``), which would
    mask or garble any real failure.
    """
    meta = Metadata()
    meta.add_email('test@pudo.org')
    assert len(meta.emails) == 1, meta.emails
    assert len(meta.domains) == 1, meta.domains
    assert meta.domains[0] == 'pudo.org', meta.domains

    # A string without an address structure must be rejected.
    meta = Metadata()
    meta.add_email('not-an-email')
    assert len(meta.emails) == 0, meta.emails
def test_basic_functions(self):
    """Title falls back to the file name; unknown codes are filtered."""
    data = {
        'file_name': 'foo.doc',
        'title': ' ',
        'languages': ['en', 'xx']
    }
    meta = Metadata.from_data(data)
    assert meta.file_name == 'foo.doc', meta.file_name
    # A whitespace-only title is replaced by the file name.
    assert meta.title == 'foo.doc', meta.title
    assert not len(meta.countries), meta.countries
def ingest_url(source_id, metadata, url):
    """Download ``url`` into a temp file, archive it and dispatch ingest.

    Fixes: the temp file created by ``mkstemp`` was never removed when
    the download or archiving failed, leaking one file per failed URL
    (the later variants of this task clean up in a ``finally`` block).
    """
    meta = Metadata(data=metadata)
    tmp_path = None
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            raise Exception("HTTP Error %r: %r" % (url, res.status_code))
        # Stream the response body to disk in 1 KiB chunks.
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            # Record the final (post-redirect) URL if none was given.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(source_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
    finally:
        # On success ``archive_file(move=True)`` has already moved the
        # file; only unlink what is still on disk.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def test_urls(self):
    """URLs are stored, de-duplicated by value, and their domains indexed.

    Fixes: two assertion messages referenced ``meta.domain`` (an
    attribute that does not appear anywhere else in these tests) instead
    of ``meta.domains``, which would raise ``AttributeError`` instead of
    showing a useful message when the assertion fails.
    """
    meta = Metadata()
    meta.urls = ['http://google.com']
    assert len(meta.urls) == 1, meta.urls
    assert len(meta.domains) == 1, meta.domains
    assert meta.domains[0] == 'google.com', meta.domains

    # An incomplete URL must not be added.
    meta.add_url('http://')
    assert len(meta.urls) == 1, meta.urls

    # A second URL on the same domain adds a URL but not a domain.
    meta.add_url('http://www.google.com/xxx')
    assert len(meta.urls) == 2, meta.urls
    assert len(meta.domains) == 1, meta.domains
def metadata():
    """Return static application metadata for API consumers."""
    enable_cache(server_side=False)
    app_info = {
        'title': six.text_type(app_title),
        'url': six.text_type(app_url),
        'samples': get_config('SAMPLE_SEARCHES')
    }
    return jsonify({
        'status': 'ok',
        'maintenance': request.authz.in_maintenance,
        'app': app_info,
        'fields': Metadata.facets(),
        'categories': get_config('COLLECTION_CATEGORIES', {}),
        'countries': COUNTRY_NAMES,
        'languages': LANGUAGE_NAMES,
        'schemata': schemata
    })
def ingest_upload(collection_id):
    """Accept user file uploads into a collection and queue them for ingest.

    Requires write access on the collection. Shared metadata arrives as
    a JSON form field named ``meta`` and is validated per file against
    the ``metadata.json#`` schema.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)
    try:
        meta = json.loads(request.form.get("meta", "{}"))
    except Exception as ex:
        # Malformed JSON in the form field becomes a 400 response.
        raise BadRequest(unicode(ex))
    metas = []
    for storage in request.files.values():
        # Each file gets its own copy of the shared metadata.
        file_meta = meta.copy()
        file_meta["mime_type"] = storage.mimetype
        file_meta["file_name"] = storage.filename
        validate(file_meta, "metadata.json#")
        file_meta = Metadata.from_data(file_meta)
        file_meta.crawler_id = "user_upload:%s" % request.auth_role.id
        file_meta.crawler_run = make_textid()
        # Store under a sanitised name before handing off to ingest.
        sec_fn = os.path.join(get_upload_folder(),
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection.id, file_meta, sec_fn, move=True)
        metas.append(file_meta)
    return jsonify({"status": "ok", "metadata": metas})
def ingest_url(collection_id, metadata, url):
    """Fetch ``url`` into a temp file, archive it, and dispatch ingest.

    Errors are recorded via ``Ingestor.handle_exception``; the DB
    session and temp file are always cleaned up.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            raise IngestorException("HTTP Error %r: %r" % (url,
                                                           res.status_code))
        # Stream the response body to disk in 1 KiB chunks.
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
def ingest(collection_id, metadata):
    """Dispatch an ingest run for the given collection and metadata."""
    Ingestor.dispatch(collection_id, Metadata.from_data(metadata))
def ingest(source_id, metadata):
    """Dispatch an ingest run for the given source and metadata."""
    Ingestor.dispatch(source_id, Metadata(data=metadata))
def make_meta(self, data=None):
    """Create a ``Metadata`` object stamped with this crawler's identity.

    Fixes: the mutable default argument ``data={}`` is a classic Python
    pitfall (a single dict shared across all calls); ``None`` is now the
    sentinel. Behaviour for callers is unchanged.
    """
    data = {} if data is None else data
    # JSON round-trip yields a deep, JSON-serialisable copy of the input.
    data = json.loads(json.dumps(data))
    data['crawler'] = self.get_id()
    data['crawler_run'] = self.crawler_run
    return Metadata(data=data)
def ingest(collection_id, metadata):
    """Dispatch an ingest run for the given collection and metadata."""
    Ingestor.dispatch(collection_id, Metadata(data=metadata))
def meta(self):
    """Expose the stored metadata dict as a ``Metadata`` instance.

    Ensures the backing dict exists and mirrors the record's own
    fields into it before wrapping it.
    """
    data = self._meta or {}
    data['content_hash'] = self.content_hash
    data['foreign_id'] = self.foreign_id
    self._meta = data
    return Metadata.from_data(data)
def make_meta(self, data=None):
    """Create a ``Metadata`` object stamped with this crawler's identity.

    Fixes: the mutable default argument ``data={}`` is a classic Python
    pitfall (a single dict shared across all calls); ``None`` is now the
    sentinel. Behaviour for callers is unchanged.
    """
    data = {} if data is None else data
    # JSON round-trip yields a deep, JSON-serialisable copy of the input.
    data = json.loads(json.dumps(data))
    meta = Metadata.from_data(data)
    meta.crawler = self.get_id()
    meta.crawler_run = self.crawler_run
    return meta