def crawl_source(slug, ignore_tags=False):
    Source.sync()
    source = Source.by_slug(slug)
    if source is None:
        raise ValueError("Invalid source: %r" % slug)
    source.crawler_instance.ignore_tags = ignore_tags
    source.crawler_instance.crawl()
def authz_sources(action):
    if action == 'read' and request.authz_sources.get('read') is None:
        request.authz_sources['read'] = Source.list_user_slugs(current_user)
    if action == 'write' and request.authz_sources.get('write') is None:
        request.authz_sources['write'] = Source.list_user_slugs(
            current_user, include_public=False)  # noqa
    return request.authz_sources[action] or []
def authz_sources(action):
    if action == 'read' and request.authz_sources.get('read') is None:
        request.authz_sources['read'] = Source.list_user_slugs(current_user)
    if action == 'write' and request.authz_sources.get('write') is None:
        request.authz_sources['write'] = Source.list_user_slugs(current_user, include_public=False)  # noqa
    return request.authz_sources[action] or []
def crawl_source(slug, ignore_tags=False):
    logging.debug('crawl source -- celery task going')
    Source.sync()
    source = Source.by_slug(slug)
    if source is None:
        raise ValueError("Invalid source: %r" % slug)
    source.crawler_instance.ignore_tags = ignore_tags
    source.crawler_instance.crawl()
def analyze(foreign_id=None):
    """Re-analyze documents in the given source (or throughout)."""
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        analyze_source.delay(source.id)
    else:
        for source in Source.all():
            analyze_source.delay(source.id)
def get_results(query, limit):
    sources = {}
    for i, row in enumerate(scan_iter(query)):
        if i >= limit:
            return
        data = {
            'file_url': url_for('documents_api.file',
                                document_id=row.get('_id'))
        }
        for name, value in row.get('_source').items():
            if name == 'source_id':
                if value not in sources:
                    source = Source.by_id(value)
                    if source is None:
                        sources[value] = '[Deleted source %s]' % value
                    else:
                        sources[value] = source.label
                value = sources[value]
                name = 'source'
            if name not in FIELDS:
                continue
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            data[name] = value
        yield data
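# A minimal consumption sketch, not part of the original module: get_results()
# above yields flat dicts, so one possible caller streams them into a CSV
# export. FIELDS is the constant from the surrounding module; csv and sys are
# stdlib. The function name and defaults below are invented for illustration.
import csv
import sys


def write_results_csv(query, limit=1000, fh=sys.stdout):
    fieldnames = list(FIELDS) + ['file_url']
    writer = csv.DictWriter(fh, fieldnames=fieldnames,
                            restval='', extrasaction='ignore')
    writer.writeheader()
    for data in get_results(query, limit):
        writer.writerow(data)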
def format_results(query):
    sources = {}
    entities = {}
    results = []
    for row in raw_iter(query):
        src = row.get('_source')
        data = {}
        for name, value in src.items():
            if isinstance(value, dict) or name in SKIP_FIELDS:
                continue
            if name == 'entities':
                load_ids = []
                for entity_id in value:
                    if entity_id not in entities:
                        load_ids.append(entity_id)
                if len(load_ids):
                    for id, ent in Entity.by_id_set(load_ids).items():
                        entities[id] = ent.name
                value = ', '.join([entities.get(e) for e in value
                                   if entities.get(e) is not None])
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            if name == 'source_id':
                # WARNING: don't do one query per row
                if value not in sources:
                    source = Source.by_id(value)
                    if source is None:
                        sources[value] = '[Deleted source %s]' % value
                    else:
                        sources[value] = source.label
                value = sources[value]
            data[name] = value
        results.append(data)
    return results
def update(slug):
    authz.require(authz.source_write(slug))
    source = obj_or_404(Source.by_slug(slug))
    source.update(request_data(), current_user)
    db.session.add(source)
    db.session.commit()
    return view(slug)
def crawl(slug):
    logging.debug('starting a crawl of %s' % slug)
    authz.require(authz.source_write(slug))
    source = obj_or_404(Source.by_slug(slug))
    crawl_source.delay(source.slug)
    logging.debug('started crawl')
    return jsonify({'status': 'ok'})
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {"created_at": {"gt": alert.notified_at}}
        })
    q = {'query': q, 'size': 150}
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch-style header/body pair for the per-document record query
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def crawl(self, directory=None, source=None, meta={}):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    db.session.commit()
    source_id = source.id
    if os.path.isfile(directory):
        self.crawl_file(source_id, directory, meta)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(source_id, file_path, meta)
def update(id):
    authz.require(authz.source_write(id))
    source = obj_or_404(Source.by_id(id))
    source.update(request_data())
    db.session.add(source)
    db.session.commit()
    return view(id)
def test_delete_source(self):
    source = Source.by_id(1000)
    res = self.client.get('/api/1/query?q="mention fruit"')
    assert res.json['total'] == 1, res.json
    delete_source(source.id)
    optimize_search()
    res = self.client.get('/api/1/query?q="mention fruit"')
    assert res.json['total'] == 0, res.json
def source(self):
    if not hasattr(self, '_source'):
        self._source = Source.create({
            'foreign_id': self.SOURCE_ID,
            'label': self.SOURCE_LABEL or self.SOURCE_ID
        })
        db.session.commit()
    return self._source
def crawl(self):
    for base_url in SITES:
        print 'Working on base_url: {}'.format(base_url)
        self.attributes = SITES[base_url]
        self.label = self.attributes['label']
        Source.create({'label': self.label, 'foreign_id': 'blacklight'})
        db.session.commit()
        self.failed_articles = 0
        page_count = self.get_page_count(base_url)
        print "Pages: {}".format(page_count)
        page_number = 1
        while page_number <= page_count:
            if self.failed_articles >= FAILED_LIMIT:
                log.warning('Failure limit reached: {}'.format(FAILED_LIMIT))
                break
            self.crawl_page(base_url, page_number, page_count)
            page_number += 1
def flush(foreign_id):
    """Reset the crawler state for a given source specification."""
    from aleph.index import delete_source
    source = Source.by_foreign_id(foreign_id)
    if source is None:
        raise ValueError("No such source: %r" % foreign_id)
    delete_source(source.id)
    source.delete()
    db.session.commit()
def source(self):
    if not hasattr(self, '_source'):
        self._source = Source.create({
            'foreign_id': self.SOURCE_ID,
            'label': self.SOURCE_LABEL or self.SOURCE_ID
        })
        db.session.commit()
        db.session.add(self._source)
    return self._source
def crawl_source(self, engine, foreign_id, data):
    source = Source.create({
        'foreign_id': foreign_id,
        'label': data.get('label')
    })
    db.session.commit()
    meta_base = data.get('meta', {})
    for name, query in data.get('queries', {}).items():
        self.crawl_query(engine, source, meta_base, name, query)
def view(slug):
    authz.require(authz.source_read(slug))
    source = obj_or_404(Source.by_slug(slug))
    etag_cache_keygen(source)
    data = source.to_dict()
    data['can_write'] = authz.source_write(slug)
    if data['can_write']:
        data['users'] = [u.id for u in source.users]
        data['config'] = source.config
    return jsonify(data)
def index():
    sources = []
    latest = set()
    for source in Source.all_by_user(current_user):
        data = source.to_dict()
        data['can_write'] = authz.source_write(source.slug)
        latest.add(data['updated_at'])
        sources.append(data)
    etag_cache_keygen(max(latest))
    return jsonify({'results': sources, 'total': len(sources)})
def view(slug):
    authz.require(authz.source_read(slug) and authz.is_admin())
    source = obj_or_404(Source.by_slug(slug))
    etag_cache_keygen(source)
    data = source.to_dict()
    data['can_write'] = authz.source_write(slug)
    if data['can_write']:
        data['users'] = [u.id for u in source.users]
        data['config'] = source.config
    return jsonify(data)
def index():
    sources = []
    latest = set()
    for source in Source.all_by_user(current_user):
        data = source.to_dict()
        data['can_write'] = authz.source_write(source.slug)
        latest.add(data['updated_at'])
        sources.append(data)
    if len(latest):
        etag_cache_keygen(max(latest))
    return jsonify({'results': sources, 'total': len(sources)})
def crawl(self):
    for base_url in SITES:
        print 'Working on base_url: {}'.format(base_url)
        self.attributes = SITES[base_url]
        self.label = self.attributes['label']
        Source.create({
            'label': self.label,
            'foreign_id': 'blacklight'
        })
        db.session.commit()
        self.failed_articles = 0
        page_count = self.get_page_count(base_url)
        print "Pages: {}".format(page_count)
        page_number = 1
        while page_number <= page_count:
            if self.failed_articles >= FAILED_LIMIT:
                log.warning('Failure limit reached: {}'.format(FAILED_LIMIT))
                break
            self.crawl_page(base_url, page_number, page_count)
            page_number += 1
def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    else:
        delete_index()
        init_search()
    for doc_id, in q:
        index_document.delay(doc_id)
def dc_projects():
    slug = request.args.get('source')
    authz.require(authz.source_read(slug))
    source = obj_or_404(Source.by_slug(slug))
    if not isinstance(source.crawler_instance, DocumentCloudCrawler):
        return jsonify({'credentials': False})
    username = request.args.get('username')
    password = request.args.get('password')
    projects = source.crawler_instance.get_projects(username, password)
    if projects is False:
        return jsonify({'credentials': False})
    else:
        return jsonify({'credentials': True, 'projects': projects})
def process_row(row, attributes):
    src = row.get('_source')
    data = {}
    for name in attributes:
        value = src.get(name)
        for attr in src.get('attributes', []):
            if attr.get('name') == name:
                value = attr.get('value')
        if name == 'entities':
            objs = Entity.by_id_set([e.get('id') for e in value])
            value = ', '.join([o.label for o in objs.values()])
        if name == 'collection':
            # WARNING: don't do one query per row
            value = unicode(Source.by_slug(value) or value)
        data[name] = value
    return data
def setUp(self):
    super(SourcesApiTestCase, self).setUp()
    self.source = Source()
    self.source.foreign_id = "test"
    self.source.label = "Test Collection"
    self.source.category = "news"
    db.session.add(self.source)
    db.session.flush()
    permission = Permission()
    permission.role_id = Role.system(Role.SYSTEM_USER)
    permission.read = True
    permission.write = True
    permission.resource_id = self.source.id
    permission.resource_type = Permission.SOURCE
    db.session.add(permission)
    db.session.commit()
def convert_sources(facet):
    output = {'values': []}
    ids = [b.get('key') for b in facet.get('buckets', [])]
    sources = Source.all_by_ids(ids).all()
    for bucket in facet.get('buckets', []):
        key = bucket.get('key')
        for source in sources:
            if source.id != key:
                continue
            output['values'].append({
                'id': key,
                'label': source.label,
                'category': source.category,
                'count': bucket.get('doc_count')
            })
    return output
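# Illustration only (ids and counts invented): convert_sources() above expects
# the bucket layout of an Elasticsearch terms aggregation keyed by source id.
example_facet = {
    'buckets': [
        {'key': 12, 'doc_count': 44},
        {'key': 15, 'doc_count': 3},
    ]
}
# convert_sources(example_facet) then returns a dict shaped like
# {'values': [{'id': 12, 'label': ..., 'category': ..., 'count': 44}, ...]},
# with label and category resolved via a single Source.all_by_ids() query.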
def sources(action):
    if not hasattr(request, 'auth_sources'):
        request.auth_sources = {READ: set(), WRITE: set()}
        if is_admin():
            for source_id, in Source.all_ids():
                request.auth_sources[READ].add(source_id)
                request.auth_sources[WRITE].add(source_id)
        else:
            q = Permission.all()
            q = q.filter(Permission.role_id.in_(request.auth_roles))
            q = q.filter(Permission.resource_type == Permission.SOURCE)
            for perm in q:
                if perm.read:
                    request.auth_sources[READ].add(perm.resource_id)
                if perm.write and request.logged_in:
                    request.auth_sources[WRITE].add(perm.resource_id)
    return list(request.auth_sources.get(action, []))
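# Usage sketch (assumed, not from the original module): because sources()
# caches the readable/writeable id sets on the request, callers can cheaply
# restrict database queries to the sources the current user may read.
def readable_sources_query():
    # READ is the module-level action constant checked by sources() above.
    return Source.all_by_ids(ids=sources(READ))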
def index(foreign_id=None, immediate=False):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        # import time; time.sleep(10)  # let's not get banned
        print('indexing %s' % doc_id)
        if immediate:
            # bypass the queue
            index_document(doc_id)
        else:
            index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch-style header/body pair for the per-document record query
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def crawl(self, directory=None, source=None):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    if os.path.isfile(directory):
        meta = self.metadata()
        meta.file_name = directory
        self.emit_file(source, meta, directory)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        log.info("Descending: %r", dirname)
        for file_name in files:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            if not os.path.isfile(file_path):
                continue
            try:
                meta = self.metadata()
                if isinstance(file_name, six.text_type):
                    meta.source_path = file_path
                else:
                    enc = chardet.detect(file_name)
                    enc = enc.get('encoding')
                    try:
                        meta.source_path = file_path.decode(enc)
                    except Exception:
                        meta.source_path = file_path.decode('ascii', 'ignore')
                self.emit_file(source, meta, file_path)
            except Exception as ex:
                log.exception(ex)
                process.exception(process.INDEX, component=self.name,
                                  source_location=directory,
                                  source_id=source.id, exception=ex)
def crawl_item(self, item, source):
    source_data = item.meta.get('source', {})
    source_fk = source_data.pop('foreign_id', source)
    if source_fk is None:
        raise ValueError("No foreign_id for source given: %r" % item)
    if source_fk not in self.sources:
        label = source_data.get('label', source_fk)
        self.sources[source_fk] = Source.create({
            'foreign_id': source_fk,
            'label': label
        })
        if source_data.get('public'):
            Permission.grant_foreign(self.sources[source_fk],
                                     Role.SYSTEM_GUEST, True, False)
        db.session.commit()
    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.sources[source_fk].id, meta, item.data_path, move=False)
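# Hedged example (foreign_id and label invented) of the per-item source
# metadata that crawl_item() above understands: 'foreign_id' selects or
# creates the Source, 'label' is optional, and 'public' grants read access
# to the guest role when the Source is first created.
example_item_source = {
    'foreign_id': 'example:registry',
    'label': 'Example Registry',
    'public': True,
}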
def crawl(self, directory=None, source=None):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    if os.path.isfile(directory):
        meta = self.metadata()
        meta.file_name = directory
        self.emit_file(source, meta, directory)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        log.info("Descending: %r", dirname)
        for file_name in files:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            if not os.path.isfile(file_path):
                continue
            try:
                meta = self.metadata()
                if isinstance(file_name, six.text_type):
                    meta.source_path = file_path
                else:
                    enc = chardet.detect(file_name)
                    enc = enc.get('encoding')
                    try:
                        meta.source_path = file_path.decode(enc)
                    except Exception:
                        meta.source_path = file_path.decode('ascii', 'ignore')
                self.emit_file(source, meta, file_path)
            except Exception as ex:
                log.exception(ex)
def sources():
    """List all sources."""
    for source in Source.all():
        print source.id, source.foreign_id, source.label
def index():
    q = Source.all_by_ids(ids=authz.sources(authz.READ))
    return jsonify(Pager(q))
def view(id):
    authz.require(authz.source_read(id))
    source = obj_or_404(Source.by_id(id))
    return jsonify(source)
def process(id):
    authz.require(authz.source_write(id))
    source = obj_or_404(Source.by_id(id))
    analyze_source.delay(source.id)
    return jsonify({'status': 'ok'})
def upgrade():
    log.info("Beginning database migration...")
    migrate.upgrade()
    log.info("Reconfiguring the search index...")
    init_search()
    Source.sync()
def create_source(self, **data):
    if 'foreign_id' not in data:
        data['foreign_id'] = self.name
    return Source.create(data)
def process(slug):
    authz.require(authz.source_write(slug))
    source = obj_or_404(Source.by_slug(slug))
    process_collection.delay(source.slug)
    return jsonify({'status': 'ok'})
def crawl(slug):
    authz.require(authz.source_write(slug))
    source = obj_or_404(Source.by_slug(slug))
    crawl_source.delay(source.slug)
    return jsonify({'status': 'ok'})
def create():
    authz.require(authz.logged_in())
    src = Source.create(request_data(), current_user)
    db.session.commit()
    return view(src.slug)