Example #1
0
def get_results(query, limit):
    """Yield one flat record per search hit, for at most ``limit`` hits.

    Every record starts with a ``file_url`` built from the hit's ``_id`` and
    is then filled from the hit's ``_source`` mapping:

    * ``collection_id`` (an iterable of collection ids) is replaced by a
      ``collections`` entry holding the sorted, comma-separated labels;
    * keys that are not listed in ``FIELDS`` are dropped (the check runs
      after the ``collection_id`` -> ``collections`` rename);
    * list/tuple/set values are joined with ``', '``.

    This is a generator; iteration stops once ``limit`` hits were yielded.
    """
    # collection id -> label; each id is resolved via Collection.by_id once.
    collections = {}
    for i, row in enumerate(scan_iter(query)):
        if i >= limit:
            return
        data = {
            'file_url': url_for('documents_api.file',
                                document_id=row.get('_id'))
        }
        for name, value in row.get('_source').items():
            if name == 'collection_id':
                colls = []
                for coll in value:
                    if coll not in collections:
                        source = Collection.by_id(coll)
                        if source is None:
                            # Name the single missing id. Formatting with the
                            # whole ``value`` list would print every id of the
                            # document (or fail for a multi-element tuple).
                            collections[coll] = \
                                '[Deleted collection %s]' % coll
                        else:
                            collections[coll] = source.label
                    colls.append(collections[coll])
                value = ', '.join(sorted(colls))
                name = 'collections'
            if name not in FIELDS:
                continue
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            data[name] = value
        yield data
Example #2
0
def get_results(query, limit):
    """Yield one flat record per search hit, for at most ``limit`` hits.

    A record carries a ``file_url`` derived from the hit's ``_id`` plus the
    ``_source`` entries whose key is in ``FIELDS``. ``source_id`` is turned
    into a ``source`` entry holding the source label, and list/tuple/set
    values are collapsed into a ``', '``-separated string.
    """
    label_by_id = {}

    def source_label(source_id):
        # Memoised lookup: Source.by_id runs once per distinct id.
        if source_id not in label_by_id:
            found = Source.by_id(source_id)
            if found is None:
                label_by_id[source_id] = '[Deleted source %s]' % source_id
            else:
                label_by_id[source_id] = found.label
        return label_by_id[source_id]

    for index, hit in enumerate(scan_iter(query)):
        if index >= limit:
            return
        file_url = url_for('documents_api.file', document_id=hit.get('_id'))
        record = {'file_url': file_url}
        for key, val in hit.get('_source').items():
            if key == 'source_id':
                key, val = 'source', source_label(val)
            if key not in FIELDS:
                continue
            if isinstance(val, (list, tuple, set)):
                val = ', '.join(val)
            record[key] = val
        yield record
Example #3
0
def get_results(query, limit):
    """Generate export rows for the hits of ``query``, capped at ``limit``.

    Each row is a dict with a ``file_url`` for the hit and the ``_source``
    values whose key appears in ``FIELDS``. The ``source_id`` value is
    replaced by the source's label under the key ``source``; list, tuple
    and set values are joined with ``', '``.
    """
    # source id -> label, filled lazily so each id is resolved only once.
    known_labels = {}
    for position, hit in enumerate(scan_iter(query)):
        if position >= limit:
            return
        out = {
            'file_url': url_for('documents_api.file',
                                document_id=hit.get('_id')),
        }
        for field, content in hit.get('_source').items():
            if field == 'source_id':
                if content not in known_labels:
                    src = Source.by_id(content)
                    known_labels[content] = (
                        '[Deleted source %s]' % content
                        if src is None else src.label
                    )
                field, content = 'source', known_labels[content]
            if field in FIELDS:
                if isinstance(content, (list, tuple, set)):
                    content = ', '.join(content)
                out[field] = content
        yield out
Example #4
0
def generate_graph(args):
    """Build an entity co-occurrence graph for the documents matching ``args``.

    Every entity uuid found on a matching document becomes a node (with the
    entity's name as ``label`` and its ``$schema`` as ``schema``), and every
    pair of distinct uuids on the same document gets one edge of weight 1.
    The multigraph is then handed to ``multigraph_to_weighted`` and the
    result of ``paginate_graph`` is returned.
    """
    wanted = ['id', 'collection', 'entities.uuid', 'entities.name',
              'entities.$schema']
    built = documents_query(args, fields=wanted, facets=False)
    # Only the 'query' part of the built request is passed on to the scan.
    scan_body = {'query': built['query']}

    cooccurrence = nx.MultiGraph()
    for hit in scan_iter(scan_body):
        doc_nodes = set()
        for ent in hit.get('_source').get('entities', []):
            node_id = ent.get('uuid')
            if not cooccurrence.has_node(node_id):
                cooccurrence.add_node(node_id,
                                      label=ent.get('name'),
                                      schema=ent.get('$schema'))
            doc_nodes.add(node_id)
        # One parallel edge per document in which the two entities co-occur.
        for pair in combinations(doc_nodes, 2):
            cooccurrence.add_edge(pair[0], pair[1], weight=1)
    return paginate_graph(multigraph_to_weighted(cooccurrence))
Example #5
0
def generate_graph(args):
    """Return the paginated entity co-occurrence graph for ``args``.

    Matching documents are scanned; each entity uuid is added as a node
    (attributes ``label`` and ``schema``) the first time it is seen, and
    each pair of uuids sharing a document is linked by a weight-1 edge.
    ``multigraph_to_weighted`` post-processes the multigraph before
    ``paginate_graph`` produces the return value.
    """
    request = documents_query(
        args,
        fields=[
            'id', 'collection', 'entities.uuid', 'entities.name',
            'entities.$schema'
        ],
        facets=False,
    )
    # Keep nothing but the 'query' clause for the scan request.
    request = {'query': request['query']}

    multi = nx.MultiGraph()
    for document in scan_iter(request):
        uuids = set()
        entity_list = document.get('_source').get('entities', [])
        for item in entity_list:
            uuid = item.get('uuid')
            if not multi.has_node(uuid):
                attrs = {
                    'label': item.get('name'),
                    'schema': item.get('$schema'),
                }
                multi.add_node(uuid, **attrs)
            uuids.add(uuid)
        for left, right in combinations(uuids, 2):
            multi.add_edge(left, right, weight=1)
    weighted = multigraph_to_weighted(multi)
    return paginate_graph(weighted)
Example #6
0
def analyze_source(source_id):
    """Call ``analyze_document.delay`` for every hit of a ``source_id`` term query."""
    # Only the hit ids are used, so '_source' is switched off in the request.
    search = {
        'query': {'term': {'source_id': source_id}},
        '_source': False,
    }
    for hit in scan_iter(search):
        doc_id = hit.get('_id')
        # NOTE(review): .delay presumably enqueues an async task - confirm.
        analyze_document.delay(doc_id)
Example #7
0
def analyze_collection(collection_id):
    """Call ``analyze_document.delay`` with the id of each hit in the collection.

    Hits are selected with a term query on ``collection_id``.
    """
    term_filter = {'term': {'collection_id': collection_id}}
    # '_source': False - the loop below reads nothing but the hit's '_id'.
    scan_body = {'query': term_filter, '_source': False}
    for hit in scan_iter(scan_body):
        analyze_document.delay(hit.get('_id'))
Example #8
0
def analyze_documents(collection_id):
    """Call ``analyze_document_id.delay`` for each ``TYPE_DOCUMENT`` hit.

    The scan uses a term query on ``collection_id`` and requests no
    ``_source`` because only the hit ids are consumed.
    """
    body = {
        'query': {'term': {'collection_id': collection_id}},
        '_source': False,
    }
    for hit in scan_iter(body, TYPE_DOCUMENT):
        hit_id = hit.get('_id')
        analyze_document_id.delay(hit_id)
Example #9
0
def query_doc_ids(query):
    """Lazily yield the ``_id`` of every hit that ``scan_iter`` returns for ``query``."""
    # Wrap the caller's query clause; '_source' is disabled since only ids
    # are read from the hits.
    scan_body = {
        'query': query,
        '_source': False,
    }
    for hit in scan_iter(scan_body):
        doc_id = hit.get('_id')
        yield doc_id
Example #10
0
def analyze_collection(collection_id):
    """Call ``analyze_document_id.delay`` for every hit in the collection."""
    by_collection = {'term': {'collection_id': collection_id}}
    # Ids are all that is needed, so the hits are fetched without '_source'.
    request = {'query': by_collection, '_source': False}
    for hit in scan_iter(request):
        doc_id = hit.get('_id')
        analyze_document_id.delay(doc_id)
Example #11
0
def query_doc_ids(query):
    """Generate hit ids for ``query``.

    The given query clause is wrapped in a request body with ``_source``
    disabled, passed to ``scan_iter``, and the ``_id`` of each hit is yielded.
    """
    request = {'query': query, '_source': False}
    hit_ids = (hit.get('_id') for hit in scan_iter(request))
    for hit_id in hit_ids:
        yield hit_id
Example #12
0
def analyze_collection(collection_id):
    """Hand the id of each hit in the collection to ``analyze_document_id.delay``."""
    # Term query on collection_id; "_source" is off because only "_id" is read.
    payload = {
        "query": {"term": {"collection_id": collection_id}},
        "_source": False,
    }
    for match in scan_iter(payload):
        match_id = match.get("_id")
        analyze_document_id.delay(match_id)