Example #1
def crawl_dir(api: AlephAPI, path: str, foreign_id: str, config: Dict):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: metadata for the collection if it has to be created
    """
    _path = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    collection_id = collection.get('id')
    _queue: Queue = Queue()
    _queue.put((_path, None, 1))
    threads = []
    for i in range(settings.THREADS):
        args = (_queue, api, collection_id, _path)
        thread = threading.Thread(target=_upload, args=args)
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # block until all tasks are done
    _queue.join()
    for thread in threads:
        thread.join()
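For orientation, a call to this function might look like the following sketch. The host, API key, directory path, and collection metadata are placeholders, not values from the original; the config is assumed to be a plain dict of collection metadata such as a label.

from alephclient.api import AlephAPI

api = AlephAPI(host="https://aleph.example.org", api_key="...")  # placeholder credentials
config = {"label": "Test Leak"}  # assumed minimal collection metadata
crawl_dir(api, "/data/test_leak", "test_leak", config)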
Example #2
def crawl_dir(api: AlephAPI,
              path: str,
              foreign_id: str,
              config: Dict,
              index: bool = True):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: metadata for the collection if it has to be created
    index: index the documents after ingest (default True)
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index)
    threads = []
    for i in range(settings.THREADS):
        thread = threading.Thread(target=crawler.execute)
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # block until all tasks are done
    crawler.queue.join()
    for thread in threads:
        thread.join()
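This variant adds an index flag. Assuming index=False defers indexing of the uploaded documents, a call that only ingests might look like this (reusing the api object from the sketch above):

crawl_dir(api, "/data/test_leak", "test_leak", {}, index=False)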
Example #3
def bulk_load(api: AlephAPI, mapping_file: str):
    data = load_config_file(mapping_file)
    if not isinstance(data, dict):
        raise AlephException('mapping_file has to be a json dictionary')
    # Each top-level key of the mapping file is a collection foreign_id.
    for foreign_id, config in data.items():
        collection = api.load_collection_by_foreign_id(foreign_id, config)
        collection_id = collection['id']
        log.info(f"Bulk mapping collection ID: {collection_id}")
        api.map_collection(collection_id, data)
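A hedged usage sketch: the file name is a placeholder, and the only structural requirement visible in the code above is that the file parses to a dictionary keyed by collection foreign_id.

from alephclient.api import AlephAPI

api = AlephAPI()  # host and API key fall back to ALEPHCLIENT_HOST / ALEPHCLIENT_API_KEY
bulk_load(api, "mappings.yml")  # placeholder file name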
Example #4
def crawl_dir(
    api: AlephAPI,
    path: str,
    foreign_id: str,
    config: Dict,
    index: bool = True,
    nojunk: bool = False,
    parallel: int = 1,
):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: metadata for the collection if it has to be created
    index: index the documents after ingest (default True)
    nojunk: skip known junk/system files while crawling
    parallel: number of parallel upload threads
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index, nojunk=nojunk)
    consumers = []

    # Use one thread to produce using scandir and at least one to consume
    # files for upload.
    producer = threading.Thread(target=crawler.crawl, daemon=True)
    producer.start()
    for i in range(max(1, parallel)):
        consumer = threading.Thread(target=crawler.consume, daemon=True)
        consumer.start()
        consumers.append(consumer)

    # Block until the producer is done with queueing the tree.
    producer.join()

    # Block until the file upload queue is drained.
    crawler.queue.join()

    # Poison the queue to signal end to each consumer.
    for consumer in consumers:
        crawler.queue.put((None, None))

    # Block until all file upload queue consumers are done.
    for consumer in consumers:
        consumer.join()
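The shutdown sequence above (join the producer, drain the queue, then enqueue one sentinel per consumer) is the standard poison-pill pattern. A minimal, self-contained sketch of the same idea, independent of CrawlDirectory:

import queue
import threading

def consume(q: queue.Queue) -> None:
    while True:
        item = q.get()
        try:
            if item is None:  # sentinel ("poison pill"): stop this consumer
                return
            print("processing", item)
        finally:
            q.task_done()  # required so q.join() can unblock

q: queue.Queue = queue.Queue()
workers = [threading.Thread(target=consume, args=(q,), daemon=True) for _ in range(2)]
for w in workers:
    w.start()
for item in range(5):
    q.put(item)
q.join()  # all real work items have been processed
for _ in workers:
    q.put(None)  # one pill per consumer
for w in workers:
    w.join()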
Example #5
def load_entities(json_file, root_path):
    api = AlephAPI()
    collection = api.load_collection_by_foreign_id('zz_occrp_pdi')
    cid = collection.get('id')
    api.write_entities(cid, generate_entities(json_file, root_path, api, cid))
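Assuming generate_entities (not shown here) yields entity data built from the JSON file, with root_path used to resolve referenced files, an invocation could be as simple as:

load_entities("entities.json", "/data/source_files")  # hypothetical paths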