def download_db(url: str = CONCEPTNET_DB_URL, db_path: PathOrStr = CONCEPTNET_DB_NAME, delete_compressed_db: bool = True) -> None:
    """Download the compressed ConceptNet database and extract it.

    Args:
        url: Link to compressed ConceptNet database.
        db_path: Path to resulting database.
        delete_compressed_db: Delete compressed database after extraction.
    """
    print("Download compressed database")
    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
    if db_path.is_file():
        raise FileExistsError(17, "File already exists", str(db_path))
    compressed_db_path = _get_download_destination_path(db_path.parent, url)
    if compressed_db_path.is_file():
        raise FileExistsError(17, "File already exists", str(compressed_db_path))
    downloader = SmartDL(url, str(compressed_db_path))
    downloader.start()
    try:
        with zipfile.ZipFile(str(compressed_db_path), 'r') as zip_f:
            print("Extract compressed database (this can take a few minutes)")
            zip_f.extractall(db_path.parent)
        if db_path.name != CONCEPTNET_DB_NAME:
            Path(db_path.parent / CONCEPTNET_DB_NAME).rename(db_path)
    finally:
        if delete_compressed_db and compressed_db_path.is_file():
            compressed_db_path.unlink()
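# Illustrative usage sketch for download_db (the target path below is a placeholder, not part
# of the library; it assumes CONCEPTNET_DB_URL points at a zip archive containing a file named
# CONCEPTNET_DB_NAME):
#
#     download_db(db_path="~/conceptnet-data/conceptnet.db")
#
# The archive is downloaded next to the target path, extracted there, and the zip file is
# removed afterwards unless delete_compressed_db=False is passed.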
def connect(
        db_path: PathOrStr = CONCEPTNET_DB_NAME,
        db_download_url: Optional[str] = CONCEPTNET_DB_URL,
        delete_compressed_db: bool = True,
        dump_download_url: str = CONCEPTNET_DUMP_DOWNLOAD_URL,
        load_dump_edge_count: int = CONCEPTNET_EDGE_COUNT,
        delete_compressed_dump: bool = True,
        delete_dump: bool = True,
) -> None:
    """Connect to ConceptNet database.

    This function connects to the ConceptNet database. If the database does not exist, there are
    two options: download a ready-made database, or download the compressed ConceptNet dump,
    extract it, and load it into a database (pass `db_download_url=None` for the second option).

    Args:
        db_path: Path to the database.
        db_download_url: Link to compressed ConceptNet database. Pass `None` to build the
            database from the dump instead.
        delete_compressed_db: Delete compressed database after extraction.
        dump_download_url: Link to compressed ConceptNet dump.
        load_dump_edge_count: Number of edges to load from the beginning of the dump file.
            Can be useful for testing.
        delete_compressed_dump: Delete compressed dump after extraction.
        delete_dump: Delete dump after loading into database.
    """
    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
    try:
        if db_path.is_file():
            _open_db(path=db_path)
        else:
            raise FileNotFoundError(2, "No such file", str(db_path))
    except FileNotFoundError:
        print(f"File not found: {db_path}")
        if db_download_url is not None:
            download_db(
                url=db_download_url,
                db_path=db_path,
                delete_compressed_db=delete_compressed_db,
            )
            _open_db(db_path)
        else:
            prepare_db(
                db_path=db_path,
                dump_download_url=dump_download_url,
                load_dump_edge_count=load_dump_edge_count,
                delete_compressed_dump=delete_compressed_dump,
                delete_dump=delete_dump,
            )
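# Illustrative usage sketch for connect, the usual entry point (paths are placeholders).
# The first call downloads or builds the database; later calls simply open the existing file:
#
#     connect(db_path="~/conceptnet-data/conceptnet.db")
#
#     # Build from the raw dump instead of downloading a prebuilt database, loading only
#     # a small number of edges for a quick smoke test:
#     connect(
#         db_path="~/conceptnet-data/test.db",
#         db_download_url=None,
#         load_dump_edge_count=100_000,
#     )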
def extract_compressed_dump(
        compressed_dump_path: PathOrStr,
        delete_compressed_dump: bool = True,
) -> None:
    """Extract compressed ConceptNet dump.

    Args:
        compressed_dump_path: Path to compressed dump to extract.
        delete_compressed_dump: Delete compressed dump after extraction.
    """
    # Accept both str and Path; .is_file() and .unlink() below require a Path.
    compressed_dump_path = Path(compressed_dump_path)
    dump_path = compressed_dump_path.with_suffix('')
    try:
        with gzip.open(str(compressed_dump_path), 'rb') as f_in:
            with open(str(dump_path), 'wb') as f_out:
                print("Extract compressed dump (this can take a few minutes)")
                shutil.copyfileobj(f_in, f_out)
    finally:
        if delete_compressed_dump and compressed_dump_path.is_file():
            compressed_dump_path.unlink()
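# Illustrative usage sketch for extract_compressed_dump (the .gz path is a placeholder).
# The extracted file keeps the same name with the .gz suffix stripped:
#
#     extract_compressed_dump(
#         "conceptnet-assertions.csv.gz",
#         delete_compressed_dump=False,
#     )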
def prepare_db(
        db_path: PathOrStr,
        dump_download_url: str = CONCEPTNET_DUMP_DOWNLOAD_URL,
        load_dump_edge_count: int = CONCEPTNET_EDGE_COUNT,
        delete_compressed_dump: bool = True,
        delete_dump: bool = True,
) -> None:
    """Prepare ConceptNet database.

    This function downloads the compressed ConceptNet dump, extracts it, and loads it into a
    database. The first two steps are optional and are executed only if needed.

    Args:
        db_path: Path to the resulting database.
        dump_download_url: Link to compressed ConceptNet dump.
        load_dump_edge_count: Number of edges to load from the beginning of the dump file.
            Can be useful for testing.
        delete_compressed_dump: Delete compressed dump after extraction.
        delete_dump: Delete dump after loading into database.
    """
    db_path = Path(db_path).expanduser().resolve()
    if db_path.is_dir():
        db_path = _generate_db_path(db_path)
    if db_path.is_file():
        raise FileExistsError(
            17, "File already exists and it is not a valid database", str(db_path))

    print("Prepare database")

    compressed_dump_path = _get_download_destination_path(db_path.parent, dump_download_url)
    dump_path = compressed_dump_path.with_suffix('')

    db_path.parent.mkdir(parents=True, exist_ok=True)

    load_dump_to_db_ = partial(
        load_dump_to_db,
        dump_path=dump_path,
        db_path=db_path,
        edge_count=load_dump_edge_count,
        delete_dump=delete_dump,
    )
    extract_compressed_dump_ = partial(
        extract_compressed_dump,
        compressed_dump_path=compressed_dump_path,
        delete_compressed_dump=delete_compressed_dump,
    )
    download_dump_ = partial(
        download_dump,
        url=dump_download_url,
        out_dir_path=db_path.parent,
    )

    # Try the cheapest path first: an already extracted dump, then an already downloaded
    # compressed dump, and only download from scratch if neither is on disk.
    try:
        load_dump_to_db_()
    except FileNotFoundError:
        try:
            extract_compressed_dump_()
            load_dump_to_db_()
        except FileNotFoundError:
            download_dump_()
            extract_compressed_dump_()
            load_dump_to_db_()
    finally:
        if delete_compressed_dump and compressed_dump_path.is_file():
            compressed_dump_path.unlink()
        if delete_dump and dump_path.is_file():
            dump_path.unlink()
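# Illustrative usage sketch for prepare_db (the path is a placeholder). Because the
# download/extract/load chain retries on FileNotFoundError, the same call works whether the
# extracted dump, only the compressed dump, or nothing at all is already on disk:
#
#     prepare_db(db_path="~/conceptnet-data/conceptnet.db", load_dump_edge_count=10_000)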
def load_dump_to_db(
        dump_path: PathOrStr,
        db_path: PathOrStr,
        edge_count: int = CONCEPTNET_EDGE_COUNT,
        delete_dump: bool = True,
) -> None:
    """Load dump to database.

    Args:
        dump_path: Path to dump to load.
        db_path: Path to resulting database.
        edge_count: Number of edges to load from the beginning of the dump file.
            Can be useful for testing.
        delete_dump: Delete dump after loading into database.
    """
    # The dump is processed in two passes: `normalize` assigns consecutive integer ids to
    # relations, languages, labels, and concepts in a temporary lmdb store, and `insert`
    # streams the edges into the SQL database using those ids as foreign keys.

    def edges_from_dump_by_parts_generator(
            count: Optional[int] = None,
    ) -> Generator[Tuple[str, str, str, str], None, None]:
        with open(str(dump_path), newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, row in enumerate(reader):
                if i == count:
                    break
                yield row[1:5]

    def extract_relation_name(uri: str) -> str:
        return _to_snake_case(uri[3:])

    def get_struct_format(length: int) -> str:
        return f'{length}Q'

    def pack_ints(*ints) -> bytes:
        return struct.pack(get_struct_format(length=len(ints)), *ints)

    def unpack_ints(buffer: bytes) -> Tuple[int, ...]:
        return struct.unpack(get_struct_format(len(buffer) // 8), buffer)

    def relation_in_bytes(relation_uri: str) -> bytes:
        relation_name = extract_relation_name(relation_uri)
        return relation_name.encode('utf8')

    def language_and_label_in_bytes(concept_uri: str) -> Tuple[bytes, bytes]:
        return tuple(
            x.encode('utf8') for x in concept_uri.split('/', maxsplit=4)[2:4])[:2]

    def normalize() -> None:
        """Normalize dump before loading into database using lmdb."""

        def normalize_relation() -> None:
            nonlocal relation_i

            relation_b = relation_in_bytes(relation_uri=relation_uri)
            relation_exists = txn.get(relation_b, db=relation_db) is not None
            if not relation_exists:
                relation_i += 1
                relation_i_b = pack_ints(relation_i)
                txn.put(relation_b, relation_i_b, db=relation_db)

        def normalize_concept(uri: str) -> None:
            nonlocal language_i, label_i, concept_i

            language_b, label_b = language_and_label_in_bytes(concept_uri=uri)

            language_id_b = txn.get(language_b, db=language_db)
            if language_id_b is None:
                language_i += 1
                language_id_b = pack_ints(language_i)
                txn.put(language_b, language_id_b, db=language_db)

            label_language_b = label_b + b'/' + language_b
            label_id_b = txn.get(label_language_b, db=label_db)
            if label_id_b is None:
                label_i += 1
                label_id_b = pack_ints(label_i)
                txn.put(label_language_b, label_id_b, db=label_db)

            concept_b = uri.encode('utf8')
            concept_id_b = txn.get(concept_b, db=concept_db)
            if concept_id_b is None:
                concept_i += 1
                concept_id_b = pack_ints(concept_i)
                txn.put(concept_b, concept_id_b, db=concept_db)

        # Counters for the next id to assign; they are incremented before use, so ids start
        # at 1 and can later double as SQL rowids.
        language_i, relation_i, label_i, concept_i = 4 * [0]

        if not dump_path.is_file():
            raise FileNotFoundError(2, 'No such file', str(dump_path))

        print('Dump normalization')

        edges = enumerate(edges_from_dump_by_parts_generator(count=edge_count))
        for i, (relation_uri, start_uri, end_uri, _) in tqdm(
                edges, unit=' edges', total=edge_count):
            normalize_relation()
            normalize_concept(start_uri)
            normalize_concept(end_uri)

    def insert() -> None:
        """Load the dump from the CSV file into the database, using the lmdb store for id lookups."""

        def insert_objects_from_edge():
            nonlocal edge_i

            def insert_relation() -> int:
                nonlocal relation_i

                relation_b = relation_in_bytes(relation_uri=relation_uri)
                result_id, = unpack_ints(
                    buffer=txn.get(relation_b, db=relation_db))
                # Objects are first seen in the same order as during normalization, so an
                # object is new exactly when its lmdb id equals the next rowid to insert.
                if result_id == relation_i:
                    name = relation_b.decode('utf8')
                    db.execute_sql('insert into relation (name) values (?)', (name, ))
                    relation_i += 1
                return result_id

            def insert_concept(uri: str) -> int:
                nonlocal language_i, label_i, concept_i

                split_uri = uri.split('/', maxsplit=4)
                language_b, label_b = language_and_label_in_bytes(concept_uri=uri)

                language_id, = unpack_ints(
                    buffer=txn.get(language_b, db=language_db))
                if language_id == language_i:
                    name = split_uri[2]
                    db.execute_sql('insert into language (name) values (?)', (name, ))
                    language_i += 1

                label_language_b = label_b + b'/' + language_b
                label_id, = unpack_ints(
                    buffer=txn.get(label_language_b, db=label_db))
                if label_id == label_i:
                    text = split_uri[3]
                    params = (text, language_id)
                    db.execute_sql(
                        'insert into label (text, language_id) values (?, ?)', params)
                    label_i += 1

                concept_b = uri.encode('utf8')
                concept_id, = unpack_ints(
                    buffer=txn.get(concept_b, db=concept_db))
                if concept_id == concept_i:
                    sense_label = '' if len(split_uri) == 4 else split_uri[4]
                    params = (label_id, sense_label)
                    db.execute_sql(
                        'insert into concept (label_id, sense_label) values (?, ?)',
                        params)
                    concept_i += 1
                return concept_id

            def insert_edge() -> None:
                params = (relation_id, start_id, end_id, edge_etc)
                db.execute_sql(
                    'insert into edge (relation_id, start_id, end_id, etc) values (?, ?, ?, ?)',
                    params)

            relation_id = insert_relation()
            start_id = insert_concept(uri=start_uri)
            end_id = insert_concept(uri=end_uri)
            insert_edge()

            edge_i += 1

        print('Dump insertion')

        relation_i, language_i, label_i, concept_i, edge_i = 5 * [1]
        edges = edges_from_dump_by_parts_generator(count=edge_count)
        progress_bar = tqdm(unit=' edges', total=edge_count)
        finished = False
        while not finished:
            edge_count_per_insert = 1000000
            with db.atomic():
                for _ in range(edge_count_per_insert):
                    try:
                        relation_uri, start_uri, end_uri, edge_etc = next(edges)
                    except StopIteration:
                        finished = True
                        break
                    insert_objects_from_edge()
                    progress_bar.update()

    GIB = 1 << 30  # 1 GiB
    dump_path = Path(dump_path)
    lmdb_db_path = dump_path.parent / f'conceptnet-lmdb-{uuid4()}.db'
    env = lmdb.open(
        str(lmdb_db_path), map_size=4 * GIB, max_dbs=5, sync=False, writemap=False)
    relation_db = env.open_db(b'relation')
    language_db = env.open_db(b'language')
    label_db = env.open_db(b'label')
    concept_db = env.open_db(b'concept')

    try:
        with env.begin(write=True) as txn:
            normalize()
            _open_db(path=db_path)
            insert()
    finally:
        shutil.rmtree(str(lmdb_db_path), ignore_errors=True)
        if delete_dump and dump_path.is_file():
            dump_path.unlink()
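# Illustrative usage sketch for load_dump_to_db (paths are placeholders; it assumes the dump
# CSV has already been extracted and that _open_db creates the relation, language, label,
# concept, and edge tables referenced by the insert statements):
#
#     load_dump_to_db(
#         dump_path="conceptnet-assertions.csv",
#         db_path="conceptnet.db",
#         edge_count=10_000,
#         delete_dump=False,
#     )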