class MigrateDB(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.DBIPFrom = local_configure["DBIPFrom"]
        self.DBIPTo = local_configure["DBIPTo"]
        self.DBPortFrom = local_configure["DBPortFrom"]
        self.DBPortTo = local_configure["DBPortTo"]
        self.DBFrom = local_configure["DBFrom"]
        self.DBTo = local_configure["DBTo"]
        self.MongoFrom = MongoDB(db_server_ip=self.DBIPFrom,
                                 db_server_port=self.DBPortFrom,
                                 database_name=self.DBFrom)
        self.MongoTo = MongoDB(db_server_ip=self.DBIPTo,
                               db_server_port=self.DBPortTo,
                               database_name=self.DBTo)

    def execute(self):
        print("From Relevant docs count:",
              self.MongoFrom.db["RelevantType"].count())
        print("To Relevant docs count:",
              self.MongoTo.db["RelevantType"].count())
        print("Begin migrating DB ...")
        documents = list(self.MongoFrom.db["RelevantType"].find())
        result1 = self.MongoTo.save_to_mongodb_many("RelevantType", documents)
        print("Relevant Finished!", self.MongoTo.db["RelevantType"].count())

        print("From FutureUse docs count:",
              self.MongoFrom.db["FutureUseType"].count())
        print("To FutureUse docs count:",
              self.MongoTo.db["FutureUseType"].count())
        print("Begin migrating DB ...")
        documents = list(self.MongoFrom.db["FutureUseType"].find())
        result2 = self.MongoTo.save_to_mongodb_many("FutureUseType", documents)
        print("FutureUse Finished!", self.MongoTo.db["FutureUseType"].count())
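# Illustrative sketch (not part of the original code): MigrateDB expects a flat
# configuration dict naming the source and target MongoDB servers and database
# names. The addresses, ports, and database names below are placeholders, not
# the project's real deployment values.
if __name__ == "__main__":
    example_configure = {
        "DBIPFrom": "127.0.0.1", "DBPortFrom": 27017, "DBFrom": "kg_source",
        "DBIPTo": "127.0.0.1", "DBPortTo": 27018, "DBTo": "kg_target",
    }
    MigrateDB(example_configure).execute()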
def __init__(self, local_configure):
    super().__init__()
    self.internal_use = local_configure["internal_use"]
    self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
    self.save_root = local_configure["save_root"]
    self.process_number_max = local_configure["process_number_max"]
    self.batch_size = local_configure["batch_size"]
    self.debug = local_configure["debug"]
    if self.debug:
        self.process_number_max = 5
        self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
    else:
        self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.22")
def __init__(self, search_warehouse_configure):
    self.name = search_warehouse_configure.name
    self.host = search_warehouse_configure.host
    self.port = search_warehouse_configure.port
    self.db_name = search_warehouse_configure.db_name
    self.user = search_warehouse_configure.user
    self.pwd = search_warehouse_configure.pwd
    self.sentence_collection_name = search_warehouse_configure.sentence_collection_name
    self.index_dir = search_warehouse_configure.index_dir
    self.memcache_ip_port = search_warehouse_configure.memcache_ip_port
    self.db_client = MongoDB(self.host, self.port, self.db_name, self.user,
                             self.pwd)
    self.sentence_collection = self.db_client.db[self.sentence_collection_name]
    tmp_files = get_files(self.index_dir, r'.*bin')
    if len(tmp_files) == 1:
        self.index_template = tmp_files[0].replace(".bin", "")
    else:
        # Pick the newest index: filenames are expected to start with an
        # 8-digit date-like prefix, which is parsed as an integer and sorted
        # in descending order.
        files_date = {}
        for filename in tmp_files:
            files_date[filename] = int(os.path.basename(filename)[:8])
        sorted_filenames = sorted(files_date.items(),
                                  key=lambda x: x[1],
                                  reverse=True)
        self.index_template = sorted_filenames[0][0].replace(".bin", "")
    index2pos, bin_handle, word2sentence_tf = load_index(self.index_template)
    self.word_index_to_position_in_file = index2pos
    self.index_bin_file_handle = bin_handle
    self.word_index_to_sentence_tf = word2sentence_tf
def process_chunk(db_configure, query, ids):
    try:
        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        wrap_db = MongoDB(db_configure["host"], int(db_configure["port"]),
                          db_configure["db_name"], db_configure['user'],
                          db_configure['pwd'])
        input_collection = wrap_db.db[db_configure["input_collection_name"]]
        df_dict = defaultdict(int)
        query['_id'] = {'$in': ids}
        response = input_collection.find(query)
        for data in response:
            word_exist = set()
            nlu = process_one(data)
            json = nlu.to_json()
            for i in json:
                ins = json[i]
                for item in ins['分词词性']:
                    elems = item.split("/")
                    word = elems[0].strip()
                    if word == "" or elems[-1] == "True" or elems[1] == "wp":
                        continue
                    word_exist.add(word)
            for word in word_exist:
                df_dict[word] += 1
        return df_dict
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % (exc_type, exc_obj,
                                                  exc_tb.tb_lineno)
        print(msg)
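# Illustrative sketch (not part of the original code): process_chunk returns
# one document-frequency dict per id chunk; a parent process would typically
# sum these into a single global table after collecting the worker results.
# The helper below only shows that merge step under that assumption.
def merge_df_dicts(chunk_results):
    """Sum per-chunk document-frequency dicts into one Counter."""
    from collections import Counter
    total_df = Counter()
    for df_dict in chunk_results:
        if df_dict:  # a chunk that raised an exception returns None
            total_df.update(df_dict)
    return total_df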
def __init__(self, local_configure):
    super().__init__()
    self.system_logger = logging.getLogger("system_log")
    self.nworkers = int(local_configure['nworkers'])
    self.db_config = {
        'host': local_configure['host'],
        'port': int(local_configure['port']),
        'user': local_configure['user'],
        'pwd': local_configure['pwd'],
        'input_db_name': local_configure['input_db_name'],
        'sentence_collection_name': local_configure['sentence_collection_name']
    }
    self.wrap_processed_db = MongoDB(self.db_config["host"],
                                     self.db_config["port"],
                                     self.db_config['input_db_name'],
                                     self.db_config['user'],
                                     self.db_config['pwd'])
    self.sentence_collection = self.wrap_processed_db.db[
        self.db_config['sentence_collection_name']]
    self.output_file = local_configure['output_file']
def __init__(self, local_configure):
    super().__init__()
    self.local_configure = local_configure
    self.output_file = local_configure["output_file"]
    self.nworkers = int(local_configure["nworkers"])
    self.db_config = {
        'host': local_configure['host'],
        'port': int(local_configure['port']),
        'db_name': local_configure['db_name'],
        'input_collection_name': local_configure['input_collection_name'],
        'user': local_configure['user'],
        'pwd': local_configure['pwd'],
    }
    self.wrap_db = MongoDB(self.db_config['host'], int(self.db_config['port']),
                           self.db_config['db_name'], self.db_config['user'],
                           self.db_config['pwd'])
    self.input_collection = self.wrap_db.db[
        self.db_config['input_collection_name']]
    self.non_id_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])", re.U)
    self.non_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])")
    self.exclude_category = ["kuakua", "kuaixun"]
    self.system_logger = logging.getLogger("system_log")
def process(self):
    self.system_logger.info('initializing...')
    os.makedirs(self.output_dir, exist_ok=True)
    visualizer.copy_config_files(self.output_dir)
    visualizer.KBID_PREFIX['Tencent_KG'] = self.kg_api_url

    self.system_logger.info('searching...')
    input_db = MongoDB(self.db_config['host'], self.db_config['port'],
                       self.db_config['input_db_name'],
                       self.db_config['user'], self.db_config['pwd'])
    input_collection = input_db.db[self.db_config['input_collection_name']]
    query = {"import_date": {"$gt": self.import_date}}
    try:
        docids = input_collection.find(query).distinct('docid')
    except pymongo.errors.OperationFailure:
        # TO-DO: fix the following error:
        # pymongo.errors.OperationFailure: distinct too big, 16mb cap
        docids = [i['docid'] for i in input_collection.find(query)]
    chunk_size = int(len(docids) / self.nworkers)
    self.system_logger.info('# of docs found: %s' % len(docids))
    if len(docids) == 0:
        return
    if chunk_size == 0:
        chunk_size = len(docids)
    self.system_logger.info('# of workers: %s' % self.nworkers)
    self.system_logger.info('chunk size: %s' % chunk_size)
    chunks = []
    for i in range(0, len(docids), chunk_size):
        chunks.append(slice(i, i + chunk_size))

    self.system_logger.info('parent pid: %s' % os.getpid())
    self.system_logger.info('processing...')
    # # Single processing
    # for c in chunks:
    #     process_chunk(self.db_config, docids[c], self.output_dir)

    # Multi-processing
    pool = multiprocessing.Pool(processes=self.nworkers)
    for c in chunks:
        args = (
            self.db_config,
            docids[c],
            self.output_dir,
        )
        pool.apply_async(
            process_chunk,
            args=args,
        )
    pool.close()
    pool.join()
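# Sketch (not part of the original pipeline): the distinct('docid') call above
# can fail with "distinct too big, 16mb cap" because the distinct command
# returns all values in a single reply document. One possible workaround,
# assuming a plain pymongo collection object, is to deduplicate server-side
# with an aggregation pipeline that returns a cursor and may spill to disk.
def distinct_docids_via_aggregation(collection, query):
    """Return the distinct 'docid' values matching `query` without the
    16MB reply-size limit of the distinct command."""
    cursor = collection.aggregate(
        [{"$match": query}, {"$group": {"_id": "$docid"}}],
        allowDiskUse=True)
    return [doc["_id"] for doc in cursor]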
def get_names(self, props):
    '''
    Extract name list from Tencent KG
    '''
    self.system_logger.info('host: %s' % self.db_config['host'])
    self.system_logger.info('port: %s' % self.db_config['port'])
    self.system_logger.info('db name: %s' % self.db_config['db_name'])
    self.system_logger.info('collection name: %s' %
                            self.db_config['input_collection_name'])
    client = MongoDB(self.db_config['host'], self.db_config['port'],
                     self.db_config['db_name'], self.db_config['user'],
                     self.db_config['pwd'])
    collection = client.db[self.db_config['input_collection_name']]

    kbid2names = defaultdict(lambda: defaultdict(int))
    kbid2types = {}
    kbid2hypernyms = {}
    kbid2popularity = {}
    count = defaultdict(int)
    res = collection.find({})
    self.system_logger.info('# of entries found: %s' % res.count())
    for i in res:
        for p in props:
            try:
                # if not set([x[0] for x in i['types']]).intersection(TYPES):
                #     continue
                kbid2types[i['_id']] = [x[0] for x in i['types']]
            except KeyError:
                count['missing_type'] += 1
            try:
                for name in i[p]:
                    kbid2names[i['_id']][name[0]] += 1
            except KeyError:
                count['missing_%s' % p] += 1
            try:
                kbid2hypernyms[i['_id']] = [x[0] for x in i['精选上位词']]
            except KeyError:
                count['missing_hypernyms'] += 1
            try:
                kbid2popularity[i['_id']] = int(i['popular'][0][0])
            except KeyError:
                count['missing_popularity'] += 1
    self.system_logger.info('Missing properties:')
    for i in count:
        self.system_logger.info('  %s: %s' % (i, count[i]))
    return kbid2names, kbid2types, kbid2hypernyms, kbid2popularity
def __init__(self, local_configure):
    self.host = local_configure['db_host']
    self.port = int(local_configure['db_port'])
    self.db_name = local_configure['db_name']
    self.user = local_configure['user']
    self.pwd = local_configure['pwd']
    self.kg_collection_name = local_configure['kg_collection_name']
    self.linker_collection_name = local_configure['linker_collection_name']
    self.client = MongoDB(self.host, self.port, self.db_name, self.user,
                          self.pwd)
    self.collection_kg = self.client.db[self.kg_collection_name]
    self.collection_mentions = self.client.db[self.linker_collection_name]
def __init__(self, local_configure):
    super().__init__()
    self.data_dir = local_configure['data_dir']
    self.output_path = local_configure['output_path']
    self.keywords = local_configure['keywords']
    self.mongo = MongoDB(local_configure['host'],
                         int(local_configure['port']),
                         local_configure['db_name'],
                         local_configure['user'],
                         local_configure['pwd'])
    self.collection = self.mongo.db[local_configure['collection_name']]
    self.system_logger = logging.getLogger("system_log")
def __init__(self, entity_warehouse_configure):
    self.name = entity_warehouse_configure.name
    self.host = entity_warehouse_configure.host
    self.port = entity_warehouse_configure.port
    self.db_name = entity_warehouse_configure.db_name
    self.user = entity_warehouse_configure.user
    self.pwd = entity_warehouse_configure.pwd
    self.collection_name = entity_warehouse_configure.entity_collection_name
    self.mentions_name = entity_warehouse_configure.entity_mentions_name
    self.memcache_ip_port = entity_warehouse_configure.memcache_ip_port
    self.db_client = MongoDB(self.host, self.port, self.db_name, self.user,
                             self.pwd)
    self.collection = self.db_client.db[self.collection_name]
    self.mentions = self.db_client.db[self.mentions_name]
def process_chunk(db_config, docids, output_dir):
    try:
        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        input_db = MongoDB(db_config['host'], db_config['port'],
                           db_config['input_db_name'], db_config['user'],
                           db_config['pwd'])
        input_collection = input_db.db[db_config['input_collection_name']]
        for docid in docids:
            html = []
            response = input_collection.find({'docid': docid})
            for sent in response:
                # text = ''.join([x.split('/')[0] for x in sent['分词词性']])
                text = sent['raw_sentence']
                entitymentions = []
                if sent['实体链接']:
                    for i in sent['实体链接']:
                        if i['entity']:
                            en = Entity(i['entity']['kbid'])
                        else:
                            en = None
                        em = EntityMention(
                            i['entity_mention'],
                            beg=i['beg'] - sent['sentence_start'],
                            end=i['end'] - sent['sentence_start'],
                            entity=en)
                        entitymentions.append(em)
                h = visualizer.visualize(text, entitymentions, stats=True)
                html.append(h)
            html = visualizer.HTML_TEMPLATE % '<br>\n'.join(html)
            outpath = '%s/%s.html' % (output_dir, docid)
            with open(outpath, 'w') as fw:
                fw.write(html)
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
def process_chunk(db_config, ids):
    try:
        wrap_processed_db = MongoDB(db_config["host"], db_config["port"],
                                    db_config['input_db_name'],
                                    db_config['user'], db_config['pwd'])
        sentence_collection = wrap_processed_db.db[
            db_config['sentence_collection_name']]
        sentences = sentence_collection.find({"_id": {"$in": ids}})
        entity2sentence_candidates = defaultdict()
        for one_sentence in sentences:
            sentence_entity = {}
            for entity_mention in one_sentence["实体链接"]:
                entity_text = entity_mention["entity_mention"]
                entity_id = None
                if "entity" in entity_mention and entity_mention[
                        "entity"] is not None:
                    entity_id = entity_mention["entity"]["kbid"]
                if entity_id is None:
                    continue
                sentence_entity[entity_id] = entity_text
            for id, text in sentence_entity.items():
                entity_key = id + "_" + text
                if entity_key not in entity2sentence_candidates:
                    entity2sentence_candidates[entity_key] = []
                entity2sentence_candidates[entity_key].append(
                    one_sentence["_id"])
        return entity2sentence_candidates
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
# assert planglinks
# logger.info('loading langlinks...')
# tmp = json.load(open(planglinks))
# for i in tmp:
#     if i['title'] in langlinks:
#         count['duplicate'] += 1
#         continue
#     langlinks[i['title']] = (i['title_ll'], i['id_ll'])
# logger.warning('# of duplicate langlinks: %s' % (count['duplicate']))
# logger.info('done.')
# del tmp

logger.info('db name: %s' % db_name)
logger.info('collection name: %s' % collection_name)
logger.info('drop collection')
client = MongoDB(host, port, db_name, user, pwd)
client.db.drop_collection(collection_name)

logger.info('importing...')
pool = multiprocessing.Pool(processes=nworker)
logger.info('# of workers: %s' % nworker)
for i in sorted(os.listdir(indir),
                key=lambda x: os.path.getsize('%s/%s' % (indir, x)),
                reverse=True):
    inpath = '%s/%s' % (indir, i)
    pool.apply_async(
        import_sents,
        args=(inpath, i),
    )
pool.close()
pool.join()
class BFSKg(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.seed_entity_id = local_configure["seed_entity_id"]
        self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.entityType = EntityType(local_configure["EntityType"])

    def execute(self):
        time_start = datetime.datetime.now()
        to_visit_save_file = self.save_root + "to_visit_save_file.npy"
        visited_save_file = self.save_root + "visited_save_file.npy"
        NoID_save_file = self.save_root + "NoID_save_file.npy"
        # If the .npy checkpoint files already exist, continue working on the
        # existing database. Otherwise, start a new crawl from the seed.
        NoID = []
        entity_id_visited = []
        entity_id_to_visit = [self.seed_entity_id]
        if os.path.isfile(visited_save_file):
            entity_id_visited = list(
                np.load(visited_save_file, allow_pickle=True))
        if os.path.isfile(to_visit_save_file):
            entity_id_to_visit = list(
                np.load(to_visit_save_file, allow_pickle=True))
        if os.path.isfile(NoID_save_file):
            NoID = list(np.load(NoID_save_file, allow_pickle=True))

        time_old = datetime.datetime.now()
        # print(entity_id_to_visit)
        while len(entity_id_to_visit) != 0:
            # Dequeue the oldest entity in the queue
            entity_id_current = entity_id_to_visit.pop(0)
            # print(entity_id_visited)
            # Skip the entity if it has already been visited
            if entity_id_current not in entity_id_visited:
                data = self.kg_api.retrieve_entity(entity_id_current, False)
                # Judge if the new entity belongs to the designated type. If not, omit it.
                # if type_number not in str(data["type"]):
                #     continue
                # data["popular"] = entity_id_current["popular"]
                # 0. set entity "_id"
                data["_id"] = data.pop("__id", None)
                if data["_id"] is None:
                    if data != defaultdict(list, {'_id': None}):
                        NoID.append(data)
                    continue
                else:
                    data["_id"] = data["_id"][0]
                # 1. Save the entity into the corresponding collection
                NeedSaveFlag = True
                for i in data.get("types", []):
                    if self.entityType.isDesiredType(i):
                        insert_id = self.MongoDB_obj.save_to_mongodb(
                            "RelevantType", data)
                        NeedSaveFlag = False
                        break
                if NeedSaveFlag:
                    insert_id = self.MongoDB_obj.save_to_mongodb(
                        "FutureUseType", data)
                    NeedSaveFlag = False
                entity_id_visited.append(entity_id_current)
                # 2. Explore children of the current entity
                children = data.get("相关实体", [])
                # children_expand = []
                # for child in children:
                #     children_expand.extend([i["__id"] for i in self.kg_api.retrieve_relevant_entities2(child)["relevant_entity_list"] if i["__id"] not in children_expand and i["__id"] not in entity_id_visited and i["__id"] not in entity_id_to_visit])
                # 3. Enqueue unvisited children
                entity_id_to_visit.extend([
                    child for child in children
                    if child not in entity_id_visited
                    and child not in entity_id_to_visit
                ])
                # entity_id_to_visit.extend(children_expand)
                # Checkpoint the crawl state every 20 visited entities
                if len(entity_id_visited) % 20 == 0:
                    print("*************************************")
                    print(len(entity_id_to_visit))
                    print(len(entity_id_visited))
                    if os.path.isfile(to_visit_save_file):
                        os.rename(
                            to_visit_save_file,
                            to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
                    if os.path.isfile(visited_save_file):
                        os.rename(
                            visited_save_file,
                            visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
                    if os.path.isfile(NoID_save_file):
                        os.rename(
                            NoID_save_file,
                            NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
                    np.save(to_visit_save_file, entity_id_to_visit)
                    np.save(visited_save_file, entity_id_visited)
                    np.save(NoID_save_file, NoID)
                    time_new = datetime.datetime.now()
                    print(time_new - time_start, time_new - time_old)
                    time_old = time_new

        # Final checkpoint once the queue is empty
        print("*************************************")
        print(len(entity_id_to_visit))
        print(len(entity_id_visited))
        if os.path.isfile(to_visit_save_file):
            os.rename(
                to_visit_save_file,
                to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
        if os.path.isfile(visited_save_file):
            os.rename(visited_save_file,
                      visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
        if os.path.isfile(NoID_save_file):
            os.rename(NoID_save_file,
                      NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
        np.save(to_visit_save_file, entity_id_to_visit)
        np.save(visited_save_file, entity_id_visited)
        np.save(NoID_save_file, NoID)
        time_new = datetime.datetime.now()
        print(time_new - time_start, time_new - time_old)
class BFSMultiprocessing(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.seed_entity_id = local_configure["seed_entity_id"]
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.entityType = EntityType(local_configure["EntityType"])
        self.process_number_max = local_configure["process_number_max"]
        self.batch_size = local_configure["batch_size"]
        self.debug = local_configure["debug"]
        if self.debug:
            self.process_number_max = 5
            self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        else:
            self.MongoDB_obj = MongoDB()

    def execute(self):
        time_start = datetime.datetime.now()
        processes = []
        to_visit_save_file = self.save_root + "to_visit_save_file.npy"
        visited_save_file = self.save_root + "visited_save_file.npy"
        NoID_save_file = self.save_root + "NoID_save_file.npy"
        # If the .npy checkpoint files already exist, continue to work on the
        # existing database. Otherwise, create a new database.
        NoID = []
        entity_id_visited = []
        entity_id_to_visit = [self.seed_entity_id]
        if os.path.isfile(visited_save_file):
            entity_id_visited = list(
                np.load(visited_save_file, allow_pickle=True))
        if os.path.isfile(to_visit_save_file):
            entity_id_to_visit = list(
                np.load(to_visit_save_file, allow_pickle=True))
        if os.path.isfile(NoID_save_file):
            NoID = list(np.load(NoID_save_file, allow_pickle=True))

        # Worker Input: seeds_to_visit, entity_id_visited, entity_id_to_visit, EntityType
        # Worker Output: data_relevant, data_future, entity_id_visited_delta, entity_id_to_visit_delta
        time_old = datetime.datetime.now()
        while len(entity_id_to_visit) > 0:
            process_number = int(len(entity_id_to_visit) / self.batch_size)
            if process_number == 0:
                # Fewer entities than one batch: run a single worker directly
                (data_relevant, data_future, entity_id_visited_delta,
                 entity_id_to_visit_delta, NoID_delta) = BFSWorker(
                     entity_id_to_visit, entity_id_visited,
                     entity_id_to_visit, self.entityType, self.kg_api)
                entity_id_to_visit = []
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "RelevantType", data_relevant)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "FutureUseType", data_future)
                entity_id_visited.extend(entity_id_visited_delta)
                entity_id_to_visit.extend(entity_id_to_visit_delta)
                NoID.extend(NoID_delta)
                print("*************************************")
                print(len(entity_id_to_visit))
                print(len(entity_id_visited))
                np.save(to_visit_save_file, entity_id_to_visit)
                np.save(visited_save_file, entity_id_visited)
                np.save(NoID_save_file, NoID)
                time_new = datetime.datetime.now()
                print(time_new - time_start, time_new - time_old)
                continue
            if process_number > self.process_number_max:
                process_number = self.process_number_max
            while len(entity_id_to_visit) >= (process_number * self.batch_size):
                parameters = []
                data_relevant = []
                data_future = []
                entity_id_visited_delta = set()
                entity_id_to_visit_delta = set()
                NoID_delta = []
                # Set parameters for multiprocessing
                for i in range(process_number):
                    temp = [
                        entity_id_to_visit[(i * self.batch_size):(
                            (i + 1) * self.batch_size)], entity_id_visited,
                        entity_id_to_visit, self.entityType, self.kg_api
                    ]
                    parameters.append(temp)
                # Multiprocessing
                with Pool(process_number) as p:
                    result_workers = p.starmap(BFSWorker, parameters)
                # delete visited entity in the multiprocessing
                entity_id_to_visit = entity_id_to_visit[((i + 1) *
                                                         self.batch_size):]
                # Merge multiprocessing results
                for i in result_workers:
                    data_relevant.extend(i[0])
                    data_future.extend(i[1])
                    entity_id_visited_delta = entity_id_visited_delta | set(
                        i[2])
                    entity_id_to_visit_delta = entity_id_to_visit_delta | set(
                        i[3])
                    NoID_delta.extend(i[4])
                # print(len(data_relevant))
                # print(len(data_future))
                if self.debug:
                    np.save("/home/markzhao/Desktop/results.npy",
                            result_workers)
                    np.save("/home/markzhao/Desktop/data_relevant.npy",
                            data_relevant)
                    np.save("/home/markzhao/Desktop/data_future.npy",
                            data_future)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "RelevantType", data_relevant)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "FutureUseType", data_future)
                entity_id_visited.extend(list(entity_id_visited_delta))
                entity_id_to_visit.extend(list(entity_id_to_visit_delta))
                NoID.extend(NoID_delta)
                print("*************************************")
                print(len(entity_id_to_visit))
                print(len(entity_id_visited))
                if os.path.isfile(to_visit_save_file):
                    os.rename(
                        to_visit_save_file,
                        to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
                if os.path.isfile(visited_save_file):
                    os.rename(
                        visited_save_file,
                        visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
                if os.path.isfile(NoID_save_file):
                    os.rename(
                        NoID_save_file,
                        NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
                np.save(to_visit_save_file, entity_id_to_visit)
                np.save(visited_save_file, entity_id_visited)
                np.save(NoID_save_file, NoID)
                time_new = datetime.datetime.now()
                print(time_new - time_start, time_new - time_old)
                time_old = time_new

        print("*************************************")
        print(len(entity_id_to_visit))
        print(len(entity_id_visited))
        if os.path.isfile(to_visit_save_file):
            os.rename(
                to_visit_save_file,
                to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
        if os.path.isfile(visited_save_file):
            os.rename(visited_save_file,
                      visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
        if os.path.isfile(NoID_save_file):
            os.rename(NoID_save_file,
                      NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
        np.save(to_visit_save_file, entity_id_to_visit)
        np.save(visited_save_file, entity_id_visited)
        np.save(NoID_save_file, NoID)
        time_new = datetime.datetime.now()
        print(time_new - time_start, time_new - time_old)
def process(self):
    log_str = 'searching...'
    self.system_logger.info(log_str)
    print(log_str)
    input_db = MongoDB(self.db_config['host'], self.db_config['port'],
                       self.db_config['input_db_name'],
                       self.db_config['user'], self.db_config['pwd'])
    input_collection = input_db.db[self.db_config['article_collection_name']]

    all_docs = []
    for source_category in self.input_source_category:
        elems = source_category.split("_")
        source = elems[0]
        category = elems[1]
        if category == "all":
            query = {"source": source, "date": {"$gte": self.date_after}}
        else:
            query = {"source": source, "category": category,
                     "date": {"$gte": self.date_after}}
        log_str = 'searching query' + str(query)
        self.system_logger.info(log_str)
        print(log_str)
        try:
            ids = input_collection.find(query).distinct('_id')
        except pymongo.errors.OperationFailure:
            # TO-DO: fix the following error:
            # pymongo.errors.OperationFailure: distinct too big, 16mb cap
            ids = [i['_id'] for i in input_collection.find(query)]
        all_docs.extend(ids)

    log_str = '# of docs found: %s' % len(all_docs)
    self.system_logger.info(log_str)
    print(log_str)
    if len(all_docs) == 0:
        return
    chunk_size = int(len(all_docs) / self.nworkers)
    if chunk_size == 0:
        chunk_size = len(all_docs)

    output_db = MongoDB(self.db_config['host'], self.db_config['port'],
                        self.db_config['output_db_name'],
                        self.db_config['user'], self.db_config['pwd'])
    sentence_collection = output_db.db[self.db_config['sentence_collection_name']]
    current_count = sentence_collection.count()
    if current_count > 0:
        current_count -= 1

    log_str = '# of workers: %s\n chunk size: %s \n' % (self.nworkers, chunk_size)
    self.system_logger.info(log_str)
    print(log_str)
    chunks = []
    for i in range(0, len(all_docs), chunk_size):
        chunks.append(slice(i, i + chunk_size))

    log_str = '# parent pid: %s\n processing...\n' % os.getpid()
    self.system_logger.info(log_str)
    print(log_str)
    # Multi-processing
    pool = multiprocessing.Pool(processes=self.nworkers)
    thread_id = 0
    for c in chunks:
        args = (self.db_config, self.sport_category_filename, {}, all_docs[c],
                self.save_batch_size, self.tmp_dir, thread_id)
        thread_id += 1
        pool.apply_async(process_chunk, args=args,)
    pool.close()
    pool.join()

    log_str = 'start merging...'
    self.system_logger.info(log_str)
    print(log_str)
    # merge some information
    current_index = current_count + 1
    current_sentence_length = 0
    current_sentence_number = 0
    all_tmp_files = get_files(self.tmp_dir, r".*json")
    group_insert = []
    for index, one_file in enumerate(all_tmp_files):
        sys.stdout.write("%d / %d\r" % (index, len(all_tmp_files)))
        json_str = codecs.open(one_file, 'r', 'utf-8').read()
        insert_sentences = json.loads(json_str)
        for one_sentence in insert_sentences:
            _length = int(one_sentence["sentence_length"])
            one_sentence["sentence_index"] = current_index
            group_insert.append(one_sentence)
            current_index += 1
            current_sentence_length += _length
        current_sentence_number += len(insert_sentences)
        if len(group_insert) == self.insert_batch_size:
            sentence_collection.insert(group_insert)
            group_insert.clear()
    if len(group_insert) > 0:
        sentence_collection.insert(group_insert)
        group_insert.clear()

    avg_length_entry = list(sentence_collection.find({"_id": "avg_length"}))
    if len(avg_length_entry) == 1:
        saved_sentence_length = avg_length_entry[0]["current_sentence_length"]
        saved_sentence_number = avg_length_entry[0]["current_sentence_number"]
        current_sentence_length += saved_sentence_length
        current_sentence_number += saved_sentence_number
        avg_length = float(current_sentence_length / current_sentence_number)
        find_query = {"_id": "avg_length"}
        update_query = {"$set": {"current_sentence_length": current_sentence_length,
                                 "current_sentence_number": current_sentence_number,
                                 "avg_length": avg_length}}
        sentence_collection.update_one(find_query, update_query)
    else:
        avg_length = float(current_sentence_length / current_sentence_number)
        sentence_collection.insert({"_id": "avg_length",
                                    "current_sentence_length": current_sentence_length,
                                    "current_sentence_number": current_sentence_number,
                                    "avg_length": avg_length})

    if current_index != sentence_collection.count():
        self.system_logger.error('sentence index is not equal to the number of sentences.\n')
        self.system_logger.error('current max sentence index is [%d], but current all sentence number is [%d].\n'
                                 % (current_index - 1, sentence_collection.count() - 1))

    log_str = 'start indexing...'
    self.system_logger.info(log_str)
    print("\n", log_str)
    sentence_collection.create_index('docid')
    sentence_collection.create_index('import_date')
    sentence_collection.create_index('news_date')
    sentence_collection.create_index('entity_len')
    sentence_collection.create_index('sentence_index')
    sentence_collection.create_index('sentence_length')
    sentence_collection.create_index('sentence_position')
    key = [('entity_set', 1)]
    pfe = {'entity_set': {'$exists': True}}
    sentence_collection.create_index(key, partialFilterExpression=pfe)
    key = [('category_set', 1)]
    pfe = {'category_set': {'$exists': True}}
    sentence_collection.create_index(key, partialFilterExpression=pfe)

    log_str = 'all done'
    self.system_logger.info(log_str)
    print(log_str)
    shutil.rmtree(self.tmp_dir)
def process_chunk(db_config, sport_category_filename, query, doc_ids,
                  batch_size, tmp_dir, thread_id):
    try:
        sport_category_classifier = SportNewsCategoryClassifier(sport_category_filename)
        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        input_db = MongoDB(db_config['host'], db_config['port'],
                           db_config['input_db_name'], db_config['user'],
                           db_config['pwd'])
        article_collection = input_db.db[db_config['article_collection_name']]
        output_db = MongoDB(db_config['host'], db_config['port'],
                            db_config['output_db_name'], db_config['user'],
                            db_config['pwd'])
        sentence_collection = output_db.db[db_config['sentence_collection_name']]

        query['_id'] = {'$in': doc_ids}
        response = article_collection.find(query, no_cursor_timeout=True)
        batch_number = 0
        to_insert = []
        all_inserted_sentence = 0
        for one_document in response:
            if len(one_document['content']) == 0:
                continue
            one_document_content = []
            for line in one_document['content']:
                if len(line.strip()) > 0:
                    one_document_content.append(line.strip())
            if len(one_document_content) == 0:
                continue
            docid = one_document['_id']
            search_query = {"docid": docid}
            if len(list(sentence_collection.find(search_query))) > 0:
                continue

            # process title
            title = one_document['title']
            title_query = process_title(title)
            # process document body
            body_query = process_document_body(one_document_content)
            category_result = sport_category_classifier.get_category(
                title_query.full_entity_ids, body_query,
                one_document['source'], one_document['category'])

            # generate a sentence from query
            sentence_to_insert = {"_id": '%s_%s' % (docid, 1)}
            search_query = {"_id": sentence_to_insert["_id"]}
            if len(list(sentence_collection.find(search_query))) > 0:
                continue
            entity_set = set(title_query.full_entity_ids)
            if len(entity_set) == 0:
                continue
            sentence_length = 0
            sentence_tokens = []
            for sentence in title_query.sentence_list:
                sentence_length += sentence.sentence_length
                for token in sentence.token_list:
                    sentence_tokens.append(token.original_text + "/" +
                                           str(token.pos) + "/" +
                                           str(token.ner) + "/" +
                                           str(token.is_stop_word))
            sentence_to_insert['分词词性'] = sentence_tokens
            sentence_to_insert['docid'] = docid
            sentence_to_insert['source'] = one_document['source']
            sentence_to_insert['category'] = one_document['category']
            sentence_to_insert['news_date'] = int(one_document['date'])
            sentence_to_insert['import_date'] = int(one_document['import_date'])
            sentence_to_insert['raw_sentence'] = title
            sentence_to_insert['sentence_length'] = sentence_length
            sentence_to_insert['sentence_position'] = 1
            sentence_to_insert['token_number'] = len(sentence_tokens)
            sentence_to_insert['entity_set'] = list(entity_set)
            sentence_to_insert['entity_len'] = len(entity_set)
            sentence_to_insert["topic_category"] = category_result.res
            to_insert.append(sentence_to_insert)

            if body_query is None:
                continue
            # generate sentences from body
            for sentence_index, sentence in enumerate(body_query.sentence_list):
                sentence_to_insert = {"_id": '%s_%s' % (docid, sentence_index + 2)}
                search_query = {"_id": sentence_to_insert["_id"]}
                if len(list(sentence_collection.find(search_query))) > 0:
                    continue
                entity_set = set(sentence.full_entity_ids)
                if len(entity_set) == 0:
                    continue
                sentence_tokens = []
                for token in sentence.token_list:
                    sentence_tokens.append(token.original_text + "/" +
                                           str(token.pos) + "/" +
                                           str(token.ner) + "/" +
                                           str(token.is_stop_word))
                sentence_to_insert['分词词性'] = sentence_tokens
                sentence_to_insert['docid'] = docid
                sentence_to_insert['source'] = one_document['source']
                sentence_to_insert['category'] = one_document['category']
                sentence_to_insert['news_date'] = int(one_document['date'])
                sentence_to_insert['import_date'] = int(one_document['import_date'])
                sentence_to_insert['raw_sentence'] = sentence.raw_sentence
                sentence_to_insert['sentence_length'] = sentence.sentence_length
                sentence_to_insert['sentence_position'] = sentence_index + 2
                sentence_to_insert['token_number'] = len(sentence_tokens)
                sentence_to_insert['entity_set'] = list(entity_set)
                sentence_to_insert['entity_len'] = len(entity_set)
                sentence_to_insert["topic_category"] = category_result.res
                to_insert.append(sentence_to_insert)

            if len(to_insert) >= batch_size:
                tmp_file = tmp_dir + "/thread_" + str(thread_id) + "_batch_" + str(batch_number) + ".json"
                batch_number += 1
                f = codecs.open(tmp_file, 'w', 'utf-8')
                f.write(json.dumps(to_insert))
                f.close()
                print("Successfully saved one batch ([%d] sentences) in file [%s]!"
                      % (len(to_insert), tmp_file))
                all_inserted_sentence += len(to_insert)
                to_insert.clear()

        if len(to_insert) > 0:
            tmp_file = tmp_dir + "/thread_" + str(thread_id) + "_batch_" + str(batch_number) + ".json"
            f = codecs.open(tmp_file, 'w', 'utf-8')
            f.write(json.dumps(to_insert))
            f.close()
            print("Successfully saved rest [%d] sentences in file [%s]!"
                  % (len(to_insert), tmp_file))
            all_inserted_sentence += len(to_insert)
            to_insert.clear()

        log_str = "One thread finished! [%d] sentences are saved!" % all_inserted_sentence
        logging.getLogger("system_log").info(log_str)
        print(log_str)
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
class SaveEntity2DB(OfflineTools):
    def __init__(self, local_configure, global_configure):
        super().__init__()
        self.global_configure = global_configure
        self.local_configure = local_configure
        self.db_name = local_configure["db_name"]
        self.collection_name = local_configure["collection_name"]
        self.data_dir = local_configure["data_dir"]
        self.db_interface = MongoDB(db_server_ip="10.93.128.143",
                                    db_server_port=27017,
                                    database_name=self.db_name)
        self.non_id_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])", re.U)
        self.system_logger = logging.getLogger("system_log")

    def execute(self):
        for source, data_path in self.data_dir.items():
            if source == "douban_movie":
                self.process_douban_movie(data_path)
            elif source == "qq_music":
                self.process_qq_music(data_path)
            else:
                sys.stdout.write("[%s] is not supported at this moment!" % source)
                continue

    def is_han(self, text):
        return any('\u4e00' <= char <= '\u9fff' for char in text)

    def process_douban_movie(self, data_path):
        current_category_dirs = os.listdir(data_path)
        for category in current_category_dirs:
            if category in ["configures", "errors", "category_list_json"]:
                continue
            category_dir = os.path.join(data_path, category) + "/"
            log_str = "\nProcessing: %s\n" % category_dir
            sys.stdout.write(log_str)
            self.system_logger.info(log_str)
            all_json_file = get_files(category_dir, r'.*json')
            for index, filename in enumerate(all_json_file):
                sys.stdout.write("%d / %d\r" % (index, len(all_json_file)))
                json_str = codecs.open(filename, 'r', 'utf-8').read()
                json_obj = json.loads(json_str)
                json_obj["_id"] = "douban_" + json_obj["id"]
                json_obj["web_category"] = category
                json_obj["entity_type"] = "Movie"
                try:
                    self.db_interface.save_to_mongodb(self.collection_name, json_obj)
                except:
                    output_str = json.dumps(json_obj)
                    self.system_logger.info(
                        "Errors writing following object into DB: \n" + output_str + "\n")
                    sys.stderr.write("Error writing object into DB\n")
                    sys.exit()

    def process_qq_music(self, data_path):
        current_category_dirs = os.listdir(data_path)
        for category in current_category_dirs:
            if category not in [
                    "内地", "台湾", "日本", "新加坡", "泰国", "韩国", "香港", "马来西亚"
            ]:
                continue
            category_dir = os.path.join(data_path, category) + "/"
            sys.stdout.write("\nProcessing: %s\n" % category_dir)
            all_json_file = get_files(category_dir, r'.*_albums.json')
            for index, filename in enumerate(all_json_file):
                sys.stdout.write("%d / %d\r" % (index, len(all_json_file)))
                singer_filename = filename.replace("_albums", "")
                if not os.path.exists(singer_filename):
                    sys.stdout.write("\n[%s] does not exist in directory [%s]\n"
                                     % (singer_filename, category_dir))
                    continue
                json_str = codecs.open(singer_filename, 'r', 'utf-8').read()
                json_obj = json.loads(json_str)
                json_obj["_id"] = "qq_music_singer_" + json_obj["singer_mid"]
                json_obj["entity_type"] = "singer"
                try:
                    self.db_interface.save_to_mongodb(self.collection_name, json_obj)
                except:
                    output_str = json.dumps(json_obj)
                    self.system_logger.info(
                        "Errors writing following object into DB: \n" + output_str + "\n")
                    sys.stderr.write("Error writing object into DB\n")
                    sys.exit()
def import_sents(pdata, name):
    try:
        client = MongoDB(host, port, db_name, user, pwd)
        collection = client.db[collection_name]
        sents = []
        with open(pdata, 'r') as f:
            for line in f:
                d = json.loads(line)
                for sent in d['sentences']:
                    ids = set()
                    titles = set()
                    ids_ll = set()
                    titles_ll = set()
                    for n, i in enumerate(sent['links']):
                        if i['id']:
                            ids.add(i['id'])
                            titles.add(i['title'])
                            if langlinks and i['title'] in langlinks:
                                title_ll, id_ll = langlinks[i['title']]
                                sent['links'][n]['id_ll'] = id_ll
                                sent['links'][n]['title_ll'] = title_ll
                                ids_ll.add(id_ll)
                                titles_ll.add(title_ll)
                    sent['ids_len'] = 0
                    if ids:
                        sent['ids'] = list(ids)
                        sent['ids_len'] = len(ids)
                    sent['ids_ll_len'] = 0
                    if ids_ll:
                        sent['ids_ll'] = list(ids_ll)
                        sent['ids_ll_len'] = len(ids_ll)
                    if titles:
                        sent['titles'] = list(titles)
                    if titles_ll:
                        sent['titles_ll'] = list(titles_ll)
                    sent['source_id'] = d['id']
                    sent['source_title'] = d['title']
                    if sent['source_title'] in langlinks:
                        title_ll, id_ll = langlinks[sent['source_title']]
                        sent['source_id_ll'] = id_ll
                        sent['source_title_ll'] = title_ll
                    sent['_chunk_id'] = name
                    sents.append(sent)
        if sents:
            # Insert a list is faster than insert_one
            # Reduce the size of the list to reduce RAM usage
            collection.insert(sents)

        # Indexing
        collection.create_index('_chunk_id')
        collection.create_index('source_id')
        collection.create_index('source_title')
        collection.create_index('source_id_ll', sparse=True)
        collection.create_index('source_title_ll', sparse=True)
        collection.create_index('start')
        collection.create_index('end')
        collection.create_index('ids_len')
        collection.create_index('ids_ll_len')
        key = [('ids', 1)]
        pfe = {'ids': {'$exists': True}}
        collection.create_index(key, partialFilterExpression=pfe)
        key = [('ids_ll', 1)]
        pfe = {'ids_ll': {'$exists': True}}
        collection.create_index(key, partialFilterExpression=pfe)
        key = [('titles', 1)]
        pfe = {'titles': {'$exists': True}}
        collection.create_index(key, partialFilterExpression=pfe)
        key = [('titles_ll', 1)]
        pfe = {'titles_ll': {'$exists': True}}
        collection.create_index(key, partialFilterExpression=pfe)
        key = [('source_id', 1), ('ids', 1)]
        pfe = {'ids': {'$exists': True}}
        collection.create_index(key, partialFilterExpression=pfe)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno, name, d['title'])
        logger.error(msg)
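# Illustrative sketch (not part of the importer): the partial indexes created
# above make entity-centric lookups cheap. The entity id value is whatever
# scheme the source dump uses; nothing here is specific to this project.
def find_sentences_linking(collection, entity_id, min_links=1):
    """Return sentences whose 'ids' field contains `entity_id` and that link
    at least `min_links` entities, served by the partial index on 'ids'."""
    return collection.find({'ids': entity_id,
                            'ids_len': {'$gte': min_links}})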
def create_metion_table(self):
    def strip_mention(text):
        text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
        text = text.lower().strip()
        text = text.replace('\\', '')
        text = ' '.join(text.split())
        return text

    def expand_mention(text):
        RE_STRIP = r' \([^)]*\)|\<[^)]*\>|,|"|\.|\'|:|-'
        # STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
        #               'from', 'has', 'he', 'i', 'in', 'is', 'it', 'its', 'of', 'on',
        #               'that', 'the', 'their', 'we', 'to', 'was', 'were', 'with',
        #               'you', 'your', 'yours', 'our', 'ours', 'theirs', 'her',
        #               'hers', 'his', 'him', 'mine', 'or', 'but', 'though', 'since']
        res = []
        # Strip mention
        res.append(''.join(re.sub(RE_STRIP, '', text).strip().split()))
        # # Remove stop words
        # res.append(' '.join([word for word in text.split()
        #                      if word not in STOP_WORDS]).strip())
        # '·' in Chinese names
        if '·' in text:
            res.append(text.replace('·', '-'))
        return res

    def filter_mention(mention):
        if not mention:
            return False
        if mention == '':
            return False
        return True

    def get_kbid2mention(data):
        res = defaultdict(lambda: defaultdict(int))
        for mention in data:
            for kbid in data[mention]:
                assert type(data[mention][kbid]) == int
                res[kbid][mention] = data[mention][kbid]
        return res

    def add_score(data):
        for mention in data:
            c = Counter(data[mention])
            tol = sum(c.values())
            assert type(tol) == int
            for kbid in data[mention]:
                data[mention][kbid] = data[mention][kbid] / tol

    kbid2names, _, _, kbid2popularity = self.get_names(MENTION_PROPS)
    mention2kbid = defaultdict(lambda: defaultdict(int))
    for kbid in kbid2names:
        for name in kbid2names[kbid]:
            mention = strip_mention(name)
            mentions = [mention]
            mentions.extend(expand_mention(mention))
            mentions = set(mentions)
            for m in mentions:
                if not filter_mention(m):
                    continue
                # mention2kbid[m][kbid] += kbid2names[kbid][name]
                try:
                    mention2kbid[m][kbid] += kbid2popularity[kbid]
                except KeyError:
                    mention2kbid[m][kbid] += 1
    mention2kbid = dict(mention2kbid)
    with open('%s/mention2kbid_raw.json' % self.output_dir, 'w') as fw:
        json.dump(mention2kbid, fw, indent=4)

    self.system_logger.info('converting kbid2mention..')
    kbid2mention = get_kbid2mention(mention2kbid)
    self.system_logger.info('done.')
    with open('%s/kbid2mention_raw.json' % self.output_dir, 'w') as fw:
        json.dump(kbid2mention, fw, indent=4)

    self.system_logger.info('computing mention2kbid...')
    add_score(mention2kbid)
    with open('%s/mention2kbid.json' % self.output_dir, 'w') as fw:
        json.dump(mention2kbid, fw, indent=4)
    self.system_logger.info('done.')

    self.system_logger.info('computing kbid2mention...')
    add_score(kbid2mention)
    with open('%s/kbid2mention.json' % self.output_dir, 'w') as fw:
        json.dump(kbid2mention, fw, indent=4)
    self.system_logger.info('done.')

    # start insert into mongo db
    self.system_logger.info('db name: %s' % self.db_config['db_name'])
    self.system_logger.info('collection name: %s' %
                            self.db_config["output_collection_name"])
    client = MongoDB(self.db_config['host'], self.db_config['port'],
                     self.db_config['db_name'], self.db_config['user'],
                     self.db_config['pwd'])
    self.system_logger.info('drop collection')
    client.db.drop_collection(self.db_config['output_collection_name'])
    collection = client.db[self.db_config['output_collection_name']]

    self.system_logger.info('processing...')
    to_insert = []
    self.system_logger.info('converting...')
    # TO-DO: save RAM
    for mention in mention2kbid:
        if sys.getsizeof(mention) >= 512:
            self.system_logger.warning('mention is too large, skip')
            continue
        entities = sorted(mention2kbid[mention].items(),
                          key=lambda x: x[1],
                          reverse=True)
        ins = {'mention': mention, 'entities': entities}
        to_insert.append(ins)

    self.system_logger.info('importing...')
    try:
        collection.insert(to_insert)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno)
        self.system_logger.error(msg)
    self.system_logger.info('done.')

    self.system_logger.info('indexing...')
    collection.create_index('mention', unique=True)
    # collection.create_index([('mention', 1), ('entities', 1)], unique=True)
    self.system_logger.info('done.')
    self.system_logger.info(collection)
    self.system_logger.info(collection.count())
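# Illustrative sketch (not part of the original code): once the mention table
# is built, candidate entities for a surface form can be looked up through the
# unique 'mention' index created above. The surface form passed in should be
# normalized the same way as strip_mention does.
def lookup_candidates(collection, surface_form):
    """Return the (kbid, score) pairs stored for a normalized mention,
    in the descending-score order they were inserted, or [] if unseen."""
    doc = collection.find_one({'mention': surface_form})
    return doc['entities'] if doc else []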