Example 1
def download_db(url: str = CONCEPTNET_DB_URL,
                db_path: PathOrStr = CONCEPTNET_DB_NAME,
                delete_compressed_db: bool = True) -> None:
    """Download compressed ConceptNet dump and extract it.

    Args:
        url: Link to compressed ConceptNet database.
        db_path: Path to resulting database.
        delete_compressed_db: Delete compressed database after extraction.
    """

    print("Download compressed database")
    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
        if db_path.is_file():
            raise FileExistsError(17, "File already exists", str(db_path))
    compressed_db_path = _get_download_destination_path(db_path.parent, url)
    if compressed_db_path.is_file():
        raise FileExistsError(17, "File already exists",
                              str(compressed_db_path))
    downloader = SmartDL(url, str(compressed_db_path))
    downloader.start()
    try:
        with zipfile.ZipFile(str(compressed_db_path), 'r') as zip_f:
            print("Extract compressed database (this can take a few minutes)")
            zip_f.extractall(db_path.parent)
        if db_path.name != CONCEPTNET_DB_NAME:
            Path(db_path.parent / CONCEPTNET_DB_NAME).rename(db_path)
    finally:
        if delete_compressed_db and compressed_db_path.is_file():
            compressed_db_path.unlink()
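
A minimal usage sketch. The import path conceptnet_lite is an assumption; only download_db and its parameters are taken from the code above.

from conceptnet_lite import download_db  # hypothetical import path

# Download the compressed database into ./data/ (the file name is generated
# because a directory is passed) and keep the archive after extraction.
download_db(db_path="./data/", delete_compressed_db=False)
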
Example 2
def connect(
    db_path: PathOrStr = CONCEPTNET_DB_NAME,
    db_download_url: Optional[str] = CONCEPTNET_DB_URL,
    delete_compressed_db: bool = True,
    dump_download_url: str = CONCEPTNET_DUMP_DOWNLOAD_URL,
    load_dump_edge_count: int = CONCEPTNET_EDGE_COUNT,
    delete_compressed_dump: bool = True,
    delete_dump: bool = True,
) -> None:
    """Connect to ConceptNet database.

    This function connects to the ConceptNet database. If it does not exist, there are two
    options: download a ready-made database, or download the compressed ConceptNet dump,
    extract it, and load it into a database (pass `db_download_url=None` for the latter option).

    Args:
        db_path: Path to the database.
        db_download_url: Link to compressed ConceptNet database. Pass `None` to build the db from dump.
        delete_compressed_db: Delete compressed database after extraction.
        dump_download_url: Link to compressed ConceptNet dump.
        load_dump_edge_count: Number of edges to load from the beginning of the dump file. Can be useful for testing.
        delete_compressed_dump: Delete compressed dump after unpacking.
        delete_dump: Delete dump after loading into database.
    """
    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
    try:
        if db_path.is_file():
            _open_db(path=db_path)
        else:
            raise FileNotFoundError(2, "No such file", str(db_path))
    except FileNotFoundError:
        print(f"File not found: {db_path}")
        if db_download_url is not None:
            download_db(
                url=db_download_url,
                db_path=db_path,
                delete_compressed_db=delete_compressed_db,
            )
            _open_db(db_path)
        else:
            prepare_db(
                db_path=db_path,
                dump_download_url=dump_download_url,
                load_dump_edge_count=load_dump_edge_count,
                delete_compressed_dump=delete_compressed_dump,
                delete_dump=delete_dump,
            )
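
A minimal usage sketch. The import path is an assumption; the two call forms correspond to the two options described in the docstring.

from conceptnet_lite import connect  # hypothetical import path

# Option 1: if the database file is missing, download the ready-made database.
connect(db_path="~/conceptnet/conceptnet.db")

# Option 2: build the database from the raw dump instead (slower).
connect(db_path="~/conceptnet/conceptnet.db", db_download_url=None)
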
Example 3
def extract_compressed_dump(
    compressed_dump_path: PathOrStr,
    delete_compressed_dump: bool = True,
):
    """Extract compressed ConceptNet dump.

    Args:
        compressed_dump_path: Path to compressed dump to extract.
        delete_compressed_dump: Delete compressed dump after extraction.
    """

    compressed_dump_path = Path(compressed_dump_path)
    dump_path = compressed_dump_path.with_suffix('')
    try:
        with gzip.open(str(compressed_dump_path), 'rb') as f_in:
            with open(str(dump_path), 'wb') as f_out:
                print("Extract compressed dump (this can take a few minutes)")
                shutil.copyfileobj(f_in, f_out)
    finally:
        if delete_compressed_dump and compressed_dump_path.is_file():
            compressed_dump_path.unlink()
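
A minimal usage sketch. The import path and the file name are assumptions.

from conceptnet_lite import extract_compressed_dump  # hypothetical import path

# Unpack ./conceptnet-assertions.csv.gz to ./conceptnet-assertions.csv,
# keeping the original .gz file.
extract_compressed_dump("./conceptnet-assertions.csv.gz",
                        delete_compressed_dump=False)
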
Example 4
def prepare_db(
    db_path: PathOrStr,
    dump_download_url: str = CONCEPTNET_DUMP_DOWNLOAD_URL,
    load_dump_edge_count: int = CONCEPTNET_EDGE_COUNT,
    delete_compressed_dump: bool = True,
    delete_dump: bool = True,
):
    """Prepare ConceptNet database.

    This function downloads the compressed ConceptNet dump, extracts it, and loads it into a database. The first two
    steps are optional and are executed only if needed.

    Args:
        db_path: Path to the resulting database.
        dump_download_url: Link to compressed ConceptNet dump.
        load_dump_edge_count: Number of edges to load from the beginning of the dump file. Can be useful for testing.
        delete_compressed_dump: Delete compressed dump after extraction.
        delete_dump: Delete dump after loading into database.
    """

    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
        if db_path.is_file():
            raise FileExistsError(
                17, "File already exists and it is not a valid database",
                str(db_path))

    print("Prepare database")
    compressed_dump_path = _get_download_destination_path(
        db_path.parent, dump_download_url)
    dump_path = compressed_dump_path.with_suffix('')

    db_path.parent.mkdir(parents=True, exist_ok=True)

    load_dump_to_db_ = partial(
        load_dump_to_db,
        dump_path=dump_path,
        db_path=db_path,
        edge_count=load_dump_edge_count,
        delete_dump=delete_dump,
    )
    extract_compressed_dump_ = partial(
        extract_compressed_dump,
        compressed_dump_path=compressed_dump_path,
        delete_compressed_dump=delete_compressed_dump,
    )
    download_dump_ = partial(
        download_dump,
        url=dump_download_url,
        out_dir_path=db_path.parent,
    )

    try:
        load_dump_to_db_()
    except FileNotFoundError:
        try:
            extract_compressed_dump_()
            load_dump_to_db_()
        except FileNotFoundError:
            download_dump_()
            extract_compressed_dump_()
            load_dump_to_db_()
    finally:
        if delete_compressed_dump and compressed_dump_path.is_file():
            compressed_dump_path.unlink()
        if delete_dump and dump_path.is_file():
            dump_path.unlink()
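
A minimal usage sketch. The import path is an assumption; load_dump_edge_count is useful for building a small test database, as noted in the docstring.

from conceptnet_lite import prepare_db  # hypothetical import path

# Build a small test database from only the first 100,000 edges of the dump.
prepare_db(db_path="./test-conceptnet.db", load_dump_edge_count=100_000)
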
Example 5
def load_dump_to_db(
    dump_path: PathOrStr,
    db_path: PathOrStr,
    edge_count: int = CONCEPTNET_EDGE_COUNT,
    delete_dump: bool = True,
):
    """Load dump to database.

    Args:
        dump_path: Path to dump to load.
        db_path: Path to resulting database.
        edge_count: Number of edges to load from the beginning of the dump file. Can be useful for testing.
        delete_dump: Delete dump after loading into database.
    """
    def edges_from_dump_by_parts_generator(
        count: Optional[int] = None,
    ) -> Generator[Tuple[str, str, str, str], None, None]:
        with open(str(dump_path), newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, row in enumerate(reader):
                if i == count:
                    break
                yield row[1:5]

    def extract_relation_name(uri: str) -> str:
        return _to_snake_case(uri[3:])

    def get_struct_format(length: int) -> str:
        return f'{length}Q'

    def pack_ints(*ints) -> bytes:
        return struct.pack(get_struct_format(length=len(ints)), *ints)

    def unpack_ints(buffer: bytes) -> Tuple[int, ...]:
        return struct.unpack(get_struct_format(len(buffer) // 8), buffer)

    def relation_in_bytes(relation_uri: str) -> bytes:
        relation_name = extract_relation_name(relation_uri)
        return relation_name.encode('utf8')

    def language_and_label_in_bytes(concept_uri: str) -> Tuple[bytes, bytes]:
        return tuple(
            x.encode('utf8')
            for x in concept_uri.split('/', maxsplit=4)[2:4])[:2]

    def normalize() -> None:
        """Normalize dump before loading into database using lmdb."""
        def normalize_relation() -> None:
            nonlocal relation_i

            relation_b = relation_in_bytes(relation_uri=relation_uri)
            relation_exists = txn.get(relation_b, db=relation_db) is not None
            if not relation_exists:
                relation_i += 1
                relation_i_b = pack_ints(relation_i)
                txn.put(relation_b, relation_i_b, db=relation_db)

        def normalize_concept(uri: str) -> None:
            nonlocal language_i, label_i, concept_i

            language_b, label_b = language_and_label_in_bytes(concept_uri=uri)

            language_id_b = txn.get(language_b, db=language_db)
            if language_id_b is None:
                language_i += 1
                language_id_b = pack_ints(language_i)
                txn.put(language_b, language_id_b, db=language_db)

            label_language_b = label_b + b'/' + language_b
            label_id_b = txn.get(label_language_b, db=label_db)
            if label_id_b is None:
                label_i += 1
                label_id_b = pack_ints(label_i)
                txn.put(label_language_b, label_id_b, db=label_db)

            concept_b = uri.encode('utf8')
            concept_id_b = txn.get(concept_b, db=concept_db)
            if concept_id_b is None:
                concept_i += 1
                concept_id_b = pack_ints(concept_i)
                txn.put(concept_b, concept_id_b, db=concept_db)

        language_i, relation_i, label_i, concept_i = 4 * [0]
        if not dump_path.is_file():
            raise FileNotFoundError(2, 'No such file', str(dump_path))
        print('Dump normalization')
        edges = enumerate(edges_from_dump_by_parts_generator(count=edge_count))
        for i, (relation_uri, start_uri, end_uri, _) in tqdm(edges,
                                                             unit=' edges',
                                                             total=edge_count):
            normalize_relation()
            normalize_concept(start_uri)
            normalize_concept(end_uri)

    def insert() -> None:
        """Load dump from CSV and lmdb database into database."""
        def insert_objects_from_edge():
            nonlocal edge_i

            def insert_relation() -> int:
                nonlocal relation_i

                relation_b = relation_in_bytes(relation_uri=relation_uri)
                result_id, = unpack_ints(
                    buffer=txn.get(relation_b, db=relation_db))
                if result_id == relation_i:
                    name = relation_b.decode('utf8')
                    db.execute_sql('insert into relation (name) values (?)',
                                   (name, ))
                    relation_i += 1
                return result_id

            def insert_concept(uri: str) -> int:
                nonlocal language_i, label_i, concept_i

                split_uri = uri.split('/', maxsplit=4)

                language_b, label_b = language_and_label_in_bytes(
                    concept_uri=uri)

                language_id, = unpack_ints(
                    buffer=txn.get(language_b, db=language_db))
                if language_id == language_i:
                    name = split_uri[2]
                    db.execute_sql('insert into language (name) values (?)',
                                   (name, ))
                    language_i += 1

                label_language_b = label_b + b'/' + language_b
                label_id, = unpack_ints(
                    buffer=txn.get(label_language_b, db=label_db))
                if label_id == label_i:
                    text = split_uri[3]
                    params = (text, language_id)
                    db.execute_sql(
                        'insert into label (text, language_id) values (?, ?)',
                        params)
                    label_i += 1

                concept_b = uri.encode('utf8')
                concept_id, = unpack_ints(
                    buffer=txn.get(concept_b, db=concept_db))
                if concept_id == concept_i:
                    sense_label = '' if len(split_uri) == 4 else split_uri[4]
                    params = (label_id, sense_label)
                    db.execute_sql(
                        'insert into concept (label_id, sense_label) values (?, ?)',
                        params)
                    concept_i += 1
                return concept_id

            def insert_edge() -> None:
                params = (relation_id, start_id, end_id, edge_etc)
                db.execute_sql(
                    'insert into edge (relation_id, start_id, end_id, etc) values (?, ?, ?, ?)',
                    params)

            relation_id = insert_relation()
            start_id = insert_concept(uri=start_uri)
            end_id = insert_concept(uri=end_uri)
            insert_edge()
            edge_i += 1

        print('Dump insertion')
        relation_i, language_i, label_i, concept_i, edge_i = 5 * [1]
        edges = edges_from_dump_by_parts_generator(count=edge_count)
        progress_bar = tqdm(unit=' edges', total=edge_count)
        finished = False
        while not finished:
            edge_count_per_insert = 1000000
            with db.atomic():
                for _ in range(edge_count_per_insert):
                    try:
                        relation_uri, start_uri, end_uri, edge_etc = next(
                            edges)
                    except StopIteration:
                        finished = True
                        break
                    insert_objects_from_edge()
                    progress_bar.update()

    GIB = 1 << 30
    dump_path = Path(dump_path)
    lmdb_db_path = dump_path.parent / f'conceptnet-lmdb-{uuid4()}.db'
    env = lmdb.open(str(lmdb_db_path),
                    map_size=4 * GIB,
                    max_dbs=5,
                    sync=False,
                    writemap=False)
    relation_db = env.open_db(b'relation')
    language_db = env.open_db(b'language')
    label_db = env.open_db(b'label')
    concept_db = env.open_db(b'concept')
    try:
        with env.begin(write=True) as txn:
            normalize()
            _open_db(path=db_path)
            insert()
    finally:
        env.close()
        shutil.rmtree(str(lmdb_db_path), ignore_errors=True)
        if delete_dump and dump_path.is_file():
            dump_path.unlink()
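
A minimal usage sketch. The import path and file names are assumptions.

from conceptnet_lite import load_dump_to_db  # hypothetical import path

# Load only the first 10,000 edges into a fresh database and keep the dump.
load_dump_to_db(dump_path="./conceptnet-assertions.csv",
                db_path="./conceptnet-small.db",
                edge_count=10_000,
                delete_dump=False)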