class AutoTransferHandler():
    """Copy every document of one collection into another collection.

    Both collections live in the same database and are opened through
    ``MongoConfig`` with ``use_localhost=False``.
    """

    def __init__(self, database_name, collection_name_from, collection_name_to):
        """Open the source and destination handles.

        Args:
            database_name: Database that holds both collections.
            collection_name_from: Name of the collection to read from.
            collection_name_to: Name of the collection to insert into.
        """
        self.config_from = MongoConfig(database_name=database_name,
                                       collection_name=collection_name_from,
                                       use_localhost=False)
        self.config_to = MongoConfig(database_name=database_name,
                                     collection_name=collection_name_to,
                                     use_localhost=False)
        # NOTE(review): despite the "cursor" name these objects are used like
        # collection handles (find / insert_one) -- confirm in MongoConfig.
        self.cursor_from = self.config_from.GetMongoCursor()
        self.cursor_to = self.config_to.GetMongoCursor()
        # Number of documents inserted so far.
        self.count = 0

    def start(self):
        """Insert every source document into the destination, one by one.

        Progress is printed after each insert. Both connections are closed
        when the transfer ends, whether it finished or raised; any error from
        ``find`` / ``insert_one`` still propagates to the caller.
        """
        print('Transfer Begin')
        try:
            for document in self.cursor_from.find():
                self.cursor_to.insert_one(document)
                self.count += 1
                print('Insert success document {}'.format(self.count))
            print('Transfer Done.')
        finally:
            # Nested so the destination is closed even if closing the source
            # connection itself fails.
            try:
                self.config_from.CloseConnection()
            finally:
                self.config_to.CloseConnection()
def Summary():
    """Attach a generated summary to every document of ``TechHub.zhihu``.

    Documents whose ``content`` is shorter than 10 characters are deleted
    instead of summarised. Every other document gets a ``summary`` field
    produced by ``AutoSummaryHandler.GetSummary``. Documents are addressed by
    their ``url`` field. The connection is closed when the function exits,
    including when an error interrupts the loop.
    """
    config = MongoConfig(database_name='TechHub', collection_name='zhihu', use_localhost=False)
    cursor = config.GetMongoCursor()
    try:
        summary_handler = AutoSummaryHandler()
        # Number of documents summarised so far (removed ones are not counted).
        count = 0
        for document in cursor.find():
            content = document['content']
            url = document['url']

            if len(content) < 10:
                cursor.delete_one({"url": url})
                print('Remove too short document')
                continue

            summary = summary_handler.GetSummary(content=content)
            cursor.update_one({'url': url}, {'$set': {'summary': summary}})
            print('Finish summary for document {}'.format(count))
            count += 1
    finally:
        # release resources
        config.CloseConnection()
from common.Config import MongoConfig

if __name__ == '__main__':
    # One-off migration: rewrite the `date` field of every TechHub.CSDN
    # document from the "YYYY年MM月DD日 ..." form to "YYYY-MM-DD".
    config = MongoConfig(database_name='TechHub', collection_name='CSDN')
    cursor = config.GetMongoCursor()
    # Single-pass replacement: 年 and 月 become '-', 日 is dropped.
    date_table = str.maketrans({"年": "-", "月": "-", "日": ""})
    count = 0
    try:
        for document in cursor.find():
            url = document['url']
            raw_date = document['date']
            # Keep only the first whitespace-separated token (the date part).
            new_time = raw_date.split()[0].translate(date_table)
            # Anything that is not exactly 10 characters long (e.g. a
            # non-zero-padded month or day) aborts the migration.
            if len(new_time) != 10:
                raise ValueError(
                    "error: unexpected date {!r} for document {}".format(raw_date, url))
            cursor.update_one({'url': url}, {'$set': {'date': new_time}})
            count += 1
            print('Finish update document {}'.format(count))
        print("done")
    finally:
        # Close the connection even when a malformed date stops the run.
        config.CloseConnection()
class DataFilterHandler():
    """Delete duplicate documents from a single collection.

    A document is deleted when its ``url`` has already been seen or, when the
    similarity filter is enabled, when its ``content`` is reported as similar
    to a previously kept document by ``SimHashFilter``.
    """

    def __init__(self, database_name, collection_name, use_localhost=False, use_similarity_filter=True):
        """Open the collection and prepare the filter state.

        Args:
            database_name: Database that holds the collection.
            collection_name: Collection to clean up.
            use_localhost: Forwarded unchanged to ``MongoConfig``.
            use_similarity_filter: When true, also drop documents whose
                content is similar to an earlier one; otherwise only
                duplicate URLs are removed.
        """
        self.mongo_config = MongoConfig(database_name=database_name,
                                        collection_name=collection_name,
                                        use_localhost=use_localhost)
        self.collection_cursor = self.mongo_config.GetMongoCursor()
        self.use_similarity_filter = use_similarity_filter
        # NOTE(review): 128 is presumably the fingerprint width -- confirm
        # against SimHashFilter.
        self.similarity_filter = SimHashFilter(128)
        # Number of documents kept so far.
        self.count = 0
        # Similarity codes of the documents kept so far.
        self.code_list = []

    # filter with distinct url and similarity
    def start(self):
        """Walk the collection once and delete duplicates.

        The connection is closed when the walk ends, whether it completed or
        raised.
        """
        cursor = self.collection_cursor
        url_set = set()
        try:
            for document in cursor.find():
                url = document['url']
                content = document['content']

                # first filter same url
                if url in url_set:
                    # Delete exactly this copy. Filtering on the url would
                    # also match the earlier document that is being kept.
                    cursor.delete_one({'_id': document['_id']})
                    print('Duplicate url for {}'.format(url))
                    continue
                url_set.add(url)

                # then filter similarity threshold
                if self.use_similarity_filter:
                    self.FilterSimilarity(cursor, url, content)
                else:
                    print('Valid document for position: {}'.format(self.count))
                    self.count += 1
        finally:
            self.mongo_config.CloseConnection()
        print('Data filter done.')

    def FilterSimilarity(self, cursor, url, content):
        """Keep or delete one document based on content similarity.

        The content's code is compared against the code of every document
        kept so far. On the first match the document(s) stored under ``url``
        are deleted; with no match the code is remembered and ``self.count``
        is incremented.

        Args:
            cursor: Collection handle used for the deletion.
            url: URL identifying the document being checked.
            content: Text of the document being checked.
        """
        code = self.similarity_filter.GetCodeForText(text=content)
        for known_code in self.code_list:
            similar, similarity = self.similarity_filter.IsSimilarByCode(known_code, code)
            if similar:
                # delete_many mirrors the multi-document semantics of the
                # legacy Collection.remove() call this replaces.
                cursor.delete_many({"url": url})
                print('Delete document for {}, similarity is {}'.format(
                    url, similarity))
                return
        print('Valid document for position: {}'.format(self.count))
        self.code_list.append(code)
        self.count += 1