def execute(self):
    config = self.config
    coll = config.tests.test_collection
    self.set_source("dirname/test.txt")
    coll.insert_one({"hello": "world"})

    data = []
    for doc in coll.find():
        data.append(doc)
    assert len(data) == 1
    assert data[0]["_job_id"] == self._id
    assert data[0]["_src"] == "test.txt"

    coll.insert_many([
        {"hello": 1},
        {"hello": 2},
    ])

    from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateOne, \
        UpdateMany, DeleteMany
    requests = [InsertOne({'hello': 3}),
                DeleteOne({'x': 1}),
                ReplaceOne({'hello': 1}, {'hello': 1.234}, upsert=True),
                UpdateOne({'hello': 2}, {'$set': {'hello': 2.234}}, upsert=True),
                UpdateMany({'hello': 3}, {'$set': {'hello': 3.234}}, upsert=True),
                DeleteMany({'x': 1})]
    coll.bulk_write(requests)

    requests = [InsertOne({'hello': 4}),
                DeleteOne({'x': 1}),
                ReplaceOne({'hello': 1.234}, {'zz': 1}, upsert=True),
                UpdateOne({'hello': 2.234}, {'$set': {'zz': 2}}, upsert=True),
                UpdateMany({'hello': 3.234}, {'$set': {'zz': 3}}, upsert=True),
                DeleteMany({'x': 1})]
    coll.bulk_write(requests)

    coll.update_one({"hello": 5}, {"$set": {"ua": 1}}, upsert=True)
    coll.update_many({"hello": "5"}, {"$set": {"ua": 2}}, upsert=True)

    data = []
    for doc in coll.find():
        data.append(doc)
    assert len(data) == 7
    assert set([d["_job_id"] for d in data]) == {self._id}
    assert set([d["_src"] for d in data]) == {"test.txt"}
def cmd_purge(fetcher=None, dataset=None, purge_all=False, **kwargs):
    """Purge one or more datasets.

    dlstats fetchers purge -f INSEE --purge-all
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP -d IPC-2015-COICOP
    """
    ctx = client.Context(**kwargs)
    ctx.log("START purge for [%s]" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        start = time.time()
        db = ctx.mongo_database()

        from pymongo import DeleteMany

        if purge_all:
            query = {"name": fetcher}
            result = db[constants.COL_PROVIDERS].bulk_write(
                [DeleteMany(query)], ordered=False)
            ctx.log("Provider [%s] deleted" % fetcher)

            query = {"provider_name": fetcher}
            result = db[constants.COL_CATEGORIES].bulk_write(
                [DeleteMany(query)], ordered=False)
            ctx.log("Categories deleted: %s" % result.deleted_count)

        query = {"provider_name": fetcher}
        if not purge_all and dataset:
            query["dataset_code"] = {"$in": dataset}

        bulk_requests = [DeleteMany(query)]

        result = db[constants.COL_DATASETS].bulk_write(bulk_requests, ordered=False)
        ctx.log("Datasets deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES].bulk_write(bulk_requests, ordered=False)
        ctx.log("Series deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES_ARCHIVES].bulk_write(bulk_requests, ordered=False)
        ctx.log("Series archives deleted: %s" % result.deleted_count)

        end = time.time() - start
        ctx.log("END purge for [%s] - time[%.3f]" % (fetcher, end))
def func2():
    """Bulk-insert documents."""
    client = MongoClient('mongodb://127.0.0.1:27017/')  # open a connection
    collection = client['blogdb'].get_collection(
        'posts', write_concern=WriteConcern(w=1, j=True, wtimeout=1))  # select the collection
    # write_concern controls when getLastError() is invoked:
    # w=1: mongod acknowledges once the write is applied in memory
    # w=1 & j=True: mongod acknowledges once the write hits memory and the journal
    # w=2: only meaningful on a replica set; acknowledges once one secondary
    #      has replicated the write from the primary
    try:
        insertData = [InsertOne({'title': i}) for i in range(4)]  # insert documents
        otherData = [
            DeleteMany({}),  # Remove all documents.
            InsertOne({'_id': 1}),
            InsertOne({'_id': 2}),
            InsertOne({'_id': 3}),
            UpdateOne({'_id': 1}, {'$set': {'foo': 'bar'}}),
            UpdateOne({'_id': 4}, {'$inc': {'j': 1}}, upsert=True),
            ReplaceOne({'j': 1}, {'j': 2}),
            DeleteOne({'_id': 2})
        ]
        collection.bulk_write(otherData + insertData, ordered=True)
    except BulkWriteError as bwe:
        print(bwe.details)
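# A minimal sketch (not from the snippet above) of how a collection could be
# opened with each of the write concerns the comments describe; the database
# and collection names here are illustrative assumptions.
from pymongo import MongoClient, WriteConcern

_client = MongoClient('mongodb://127.0.0.1:27017/')
_db = _client['blogdb']
fast = _db.get_collection('posts', write_concern=WriteConcern(w=1))             # in-memory ack
durable = _db.get_collection('posts', write_concern=WriteConcern(w=1, j=True))  # journaled ack
replicated = _db.get_collection('posts', write_concern=WriteConcern(w=2))       # one-secondary ack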
def delete(self, date):
    if isinstance(date, string_types) or isinstance(date, datetime):
        return self.collection.delete_many(date_range(date)).deleted_count
    else:
        if len(date):
            return self.collection.bulk_write(
                [DeleteMany(date_range(d)) for d in date]).deleted_count
def start_requests(self):
    password = input("GameLog Password: ")
    uri = "mongodb+srv://kevin:" + password + "@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
    client = pymongo.MongoClient(uri)
    db = client.NBA_Match_Ups
    dbTest = db.test
    dbTest.bulk_write([
        DeleteMany({}),  # Remove all documents
    ])
    # Project only the url of each basketball-reference page
    players = db.Players.find({}, {'basketballreference_page': 1, "_id": 0})
    listOfUrls = []  # Empty list
    for document in players:  # Walk the entire cursor
        listOfUrls.append(document['basketballreference_page'])  # Add the url to the list

    # List of urls to go through
    urls = listOfUrls
    # urls = ["https://www.basketball-reference.com/players/l/lillada01/gamelog/2013",
    #         "https://www.basketball-reference.com/players/l/lillada01.html"]
    for url in urls:  # Run parse for each url in urls
        yield scrapy.Request(url=url, callback=self.parsePlayers)
async def delete(self, models: ModuleType, indexer: str = "slug"):
    children = {}
    parent = await self.ancestors(models, True)
    if parent:
        self_class = self.__class__.__name__
        for field in fields(parent):
            if "model" in field.metadata and field.metadata["model"] == self_class:
                childs = getattr(parent, field.name)
                childs.remove(
                    getattr(self, "_id" if field.type == List[ObjectId] else indexer))
                children[field.name] = childs
    actions = [
        DeleteOne({"_id": self._id}),
        DeleteMany({"path": {"$regex": f"^{self.get_url()}"}})
    ]
    if children:
        actions.append(UpdateOne({"_id": parent._id}, {"$set": children}))
    async with await self._table.database.client.start_session() as s:
        async with s.start_transaction():
            await self._table.bulk_write(actions)
    self.id_ = None
def update_orbits(self, username: str, fleets: List[Fleet]):
    requests = [
        DeleteMany({'username': username, 'sourceType': SourceType.planet.name}),
    ]
    for fleet in fleets:
        requests.append(InsertOne(serialize(fleet)))
    self.collection.bulk_write(requests)
def gen_bulk_operations(iterable, truncate=False):
    '''
    @param iterable: The iterable of documents to insert into the DB.
    @param truncate: If True, a DeleteMany operation will be applied before
        the other operations.
    @return: A list of bulk-write request objects.
    '''
    return ([DeleteMany({})] if truncate else []) + \
        [InsertOne(r) for r in iterable]
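# A minimal usage sketch for the helper above (database, collection, and
# document contents are illustrative assumptions): clear a collection and
# repopulate it in one bulk_write round trip.
from pymongo import MongoClient

coll = MongoClient()['mydb']['mycoll']
docs = [{'n': i} for i in range(3)]
ops = gen_bulk_operations(docs, truncate=True)  # [DeleteMany({}), InsertOne(...), ...]
result = coll.bulk_write(ops, ordered=True)     # ordered, so the delete runs first
print(result.deleted_count, result.inserted_count)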
def callback(client):
    client.db.collection.bulk_write([
        InsertOne({}),
        InsertOne({}),
        UpdateOne({}, {'$inc': {'x': 1}}),
        DeleteMany({})
    ])
def save_schedule(specialty, schedules):
    requests = []
    specialty = specialties_coll.find_one_and_update(
        specialty, {'$set': specialty}, upsert=True,
        return_document=ReturnDocument.AFTER)
    requests.append(DeleteMany({'specialty': specialty['_id']}))
    for s in schedules:
        doc = s.copy()
        doc['specialty'] = specialty['_id']
        requests.append(InsertOne(doc))
    return schedule_coll.bulk_write(requests)
def admin_create_new_api_key(email):
    api_key = tokening.generate_random_key(length=64)
    operations = [
        DeleteMany({'email': email}),
        InsertOne({
            'email': email,
            'api_key': api_key
        })
    ]
    admin_api_keys_collection.bulk_write(operations, ordered=True)
    return api_key
def handle_clean(self, options, events):
    before = options.get("before")
    if before:
        before = datetime.datetime.strptime(before, "%Y-%m-%d")
    else:
        self.print("Before is not set, use default")
        before = datetime.datetime.now() - DEFAULT_CLEAN
    force = options.get("force")
    aa = ActiveAlarm._get_collection()
    ah = ArchivedAlarm._get_collection()
    ae = ActiveEvent._get_collection()
    event_ts = ae.find_one({"timestamp": {"$lte": before}}, limit=1, sort=[("timestamp", 1)])
    event_ts = event_ts["timestamp"]
    print("[%s] Cleaning before %s ...\n" % ("events", before), end="")
    bulk = []
    window = CLEAN_WINDOW
    while event_ts < before:
        refer_event_ids = []
        for e in [aa, ah]:
            for ee in e.find(
                {"timestamp": {"$gte": event_ts, "$lte": event_ts + CLEAN_WINDOW}},
                {"opening_event": 1, "closing_event": 1},
            ):
                if "opening_event" in ee:
                    refer_event_ids += [ee["opening_event"]]
                if "closing_event" in ee:
                    refer_event_ids += [ee["closing_event"]]
        try:
            clear_qs = {
                "timestamp": {"$gte": event_ts, "$lte": event_ts + CLEAN_WINDOW},
                "_id": {"$nin": refer_event_ids},
            }
            self.print(
                "Interval: %s, %s; Count: %d"
                % (event_ts, event_ts + CLEAN_WINDOW, ae.count(clear_qs))
            )
            bulk += [DeleteMany(clear_qs)]
            event_ts += window
            if window != CLEAN_WINDOW:
                window = CLEAN_WINDOW
        except DocumentTooLarge:
            window = window // 2
            if window < datetime.timedelta(hours=1):
                self.die("Too many events to delete in interval %s" % window)
            event_ts -= window
    if force:
        self.print("All data before %s from active events will be removed...\n" % before)
        for i in reversed(range(1, 10)):
            self.print("%d\n" % i)
            time.sleep(1)
        ae.bulk_write(bulk)
def start_requests(self):
    password = input("Password: ")
    uri = "mongodb+srv://kevin:" + password + "@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
    client = pymongo.MongoClient(uri)
    db = client.NBA_Match_Ups.Teams
    db.bulk_write([
        DeleteMany({}),  # Remove all documents
    ])
    urls = [  # List of urls to go through
        "https://www.basketball-reference.com/teams/"
    ]
    for url in urls:  # Run parse for each url in urls
        yield scrapy.Request(url=url, callback=self.parseTeamIndex)
def clean_toc(db):
    logger.info('cleaning table of contents')
    out_of_sync_tocs = db.toc.find({'synced': False}, {
        '_id': False,
        'collection': True,
        'path': True
    })
    for doc in out_of_sync_tocs:
        db[doc['collection']].delete_many({'path': doc['path']})
    bulk = [DeleteMany({'synced': False})]
    try:
        db.toc.bulk_write(bulk)
    except BulkWriteError as e:
        logger.error(e)
def start_requests(self):
    uri = "mongodb+srv://kevin:" + password + "@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
    client = pymongo.MongoClient(uri)
    db = client.NBA_Match_Ups.Players
    db.bulk_write([
        DeleteMany({}),  # Remove all documents
    ])
    urls = [  # List of urls to go through
        "https://www.basketball-reference.com/leagues/NBA_2019.html"
        # "https://www.basketball-reference.com/players/l/lillada01.html"
    ]
    for url in urls:  # Run parse for each url in urls
        yield scrapy.Request(url=url, callback=self.parseLeague)
def trigger(email):
    # This function can be used for the initial send as well as for resending
    helper_account = helper_accounts_collection.find_one({'email': email})
    if helper_account['account']['email_verified']:
        return formatting.status('email already verified')

    # Generate a new token
    verification_token = tokening.generate_random_key(length=64)
    helper_id = ObjectId(helper_account["_id"])

    # Create a new token record, removing any previous tokens for this helper
    record = {'helper_id': helper_id, 'token': verification_token}
    operations = [DeleteMany({'helper_id': helper_id}), InsertOne(record)]
    email_tokens_collection.bulk_write(operations, ordered=True)

    # Trigger the token email
    return send(email, verification_token)
def delete_many(self, **kwargs):
    '''
    Delete many documents.

        person1 = Person.new(name='joe', age=30)
        person2 = Person.new(name='jill', age=31)
        person3 = Person.new(name='bob', age=50)

        bulk = Person.bulk()
        bulk.delete_many(age__gt=30)
        bulk.save()
        # Now jill and bob are gone, having age > 30

    :param **kwargs: the query to run the delete many with
    :return: the ``pymongo.DeleteMany`` operation appended to the batch
    '''
    query, _ = self.klass._build_query(kwargs)
    delete = DeleteMany(query)
    self.ops.append(delete)
    return delete
def upload_meetings(db, documents):
    deleted_set = set()
    write_ops = []

    # Delete the old meetings first
    for doc in documents:
        course_marker = (doc['semester'], doc['year'], doc['courseId'])
        if course_marker not in deleted_set:
            write_ops.append(DeleteMany(
                {
                    'courseId': doc['courseId'],
                    'semester': doc['semester'],
                    'year': doc['year'],
                }
            ))
            deleted_set.add(course_marker)
    result = db['meetings'].bulk_write(write_ops)
    print('[Worker] Deleted', result.deleted_count, 'in meetings')

    # Then add the new documents
    result = db['meetings'].insert_many(documents)
    print('[Worker] Added', len(result.inserted_ids), 'in meetings')
def writeFullXch(cnc, prd=False):
    if prd:
        xdays = dtm.utcnow() - dtm(2016, 1, 1, 1, 1, 1, 111)
        xdays = xdays.days
    else:
        xdays = 5
    xdict = scrapeXchSet(cnc, ndays=xdays)
    if xdict:
        odo = cnxCH.bulk_write(
            [DeleteMany({'_id': xdict['_id']}), InsertOne(xdict)])
    else:
        odo = False
    cnxLog.insert_one({
        'env': prd,
        'module': 'writeFullXch',
        'epoch': dtm.utcnow(),
        'gist': str(odo),
        'service': 'xch',
        'app': 'S-Ticker'
    })
    return odo
def delete_many(self, collection: str, doc_ids: List[ObjectId]):
    """
    Delete documents from the database.

    Parameters
    ----------
    collection: str
        The db collection to delete documents from.
    doc_ids: List[ObjectId]
        The list of document ids to delete.
    """
    delete_request = DeleteMany({"_id": {"$in": doc_ids}})
    delete_many_results = self._db.get_collection(collection).bulk_write(
        [delete_request]
    )
    delete_msg = (
        f"Deleted {delete_many_results.deleted_count}/{len(doc_ids)} {collection}."
    )
    if delete_many_results.deleted_count != len(doc_ids):
        log.error(delete_msg)
    else:
        log.info(delete_msg)
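# A hedged variant of the helper above (not part of the original snippet):
# for very large id lists it can be safer to split the $in filter into chunks
# so no single DeleteMany query document grows unboundedly.
from typing import List
from bson import ObjectId
from pymongo import DeleteMany

def chunked_delete_requests(doc_ids: List[ObjectId], chunk_size: int = 10000):
    """Build one DeleteMany request per chunk of ids."""
    return [
        DeleteMany({"_id": {"$in": doc_ids[i:i + chunk_size]}})
        for i in range(0, len(doc_ids), chunk_size)
    ]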
def test_update_organisations(self, get_collection_mock):
    task = Mock()
    records = [{
        ORG_UNIQUE_FIELD: "1",
        "text": "data"
    }, {
        ORG_UNIQUE_FIELD: "12",
        "text": "data data"
    }]

    update_organisations(task, records)

    get_collection_mock.assert_called_once_with(
        collection_name=TREASURY_ORG_COLLECTION)
    get_collection_mock.return_value.bulk_write.assert_called_once_with([
        UpdateOne({ORG_UNIQUE_FIELD: "1"}, {"$set": records[0]}, upsert=True),
        UpdateOne({ORG_UNIQUE_FIELD: "12"}, {"$set": records[1]}, upsert=True),
        DeleteMany({'edrpou_code': {'$nin': ['1', '12']}})
    ])
def update_organisations(task, records):
    collection = get_collection(collection_name=TREASURY_ORG_COLLECTION)
    operations = []
    codes = []
    for org in records:
        code = org[ORG_UNIQUE_FIELD]
        codes.append(code)
        operations.append(
            UpdateOne({ORG_UNIQUE_FIELD: code}, {"$set": org}, upsert=True))
    if codes:
        operations.append(
            DeleteMany({ORG_UNIQUE_FIELD: {"$nin": codes}})  # delete codes not on the list
        )
    try:
        result = collection.bulk_write(operations)
    except PyMongoError as e:
        logger.exception(e, extra={"MESSAGE_ID": "MONGODB_ACCESS_ERROR"})
        raise task.retry()
    else:
        return result.bulk_api_result
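# A minimal sketch of the upsert-then-prune pattern used above (database,
# collection, and field names here are illustrative, not from that codebase):
# every incoming record is upserted by its unique code, then one DeleteMany
# removes anything whose code was not seen in this batch.
from pymongo import MongoClient, UpdateOne, DeleteMany

orgs = MongoClient()['demo']['orgs']
records = [{'code': 'A', 'name': 'Org A'}, {'code': 'B', 'name': 'Org B'}]
ops = [UpdateOne({'code': r['code']}, {'$set': r}, upsert=True) for r in records]
ops.append(DeleteMany({'code': {'$nin': [r['code'] for r in records]}}))
result = orgs.bulk_write(ops)  # ordered by default, so the prune runs last
print(result.upserted_count, result.deleted_count)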
# Queries
# post1 = posts.find_one({"author": "Mike"})
# print(post1)
# post1 = posts.find_one({"_id": post_id})
# print(post1)
# post1 = posts.find_one({'_id': ObjectId("5a07a674dcfba13028c7022b")})
# print(post1)
# posts.remove({"author": "Mike"})

# bulk insert
# ids = posts.insert_many([{'i': i} for i in range(10000)]).inserted_ids
# print("count of posts:", posts.count())

posts.remove({})
result = posts.bulk_write([
    DeleteMany({}),  # Remove all documents from the previous example.
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    InsertOne({'_id': 3}),
    UpdateOne({'_id': 1}, {'$set': {'foo': 'bar'}}),
    UpdateOne({'_id': 4}, {'$inc': {'j': 1}}, upsert=True),
    ReplaceOne({'j': 1}, {'j': 2})
])
pprint(result.bulk_api_result)
for post in posts.find():
    pprint(post)
async def importfbans_func(message, fed, strings, document=None):
    global user_id
    file_type = os.path.splitext(document['file_name'])[1][1:]

    if file_type == 'json':
        if document['file_size'] > 1000000:
            await message.reply(strings['big_file_json'].format(num='1'))
            return
    elif file_type == 'csv':
        if document['file_size'] > 52428800:
            await message.reply(strings['big_file_csv'].format(num='50'))
            return
    else:
        await message.reply(strings['wrong_file_ext'])
        return

    f = await bot.download_file_by_id(document.file_id, io.BytesIO())
    msg = await message.reply(strings['importing_process'])

    data = None
    if file_type == 'json':
        try:
            data = rapidjson.load(f).items()
        except ValueError:
            return await message.reply(strings['invalid_file'])
    elif file_type == 'csv':
        data = csv.DictReader(io.TextIOWrapper(f))

    real_counter = 0
    queue_del = []
    queue_insert = []
    current_time = datetime.now()
    for row in data:
        if file_type == 'json':
            user_id = row[0]
            data = row[1]
        elif file_type == 'csv':
            if 'user_id' in row:
                user_id = int(row['user_id'])
            elif 'id' in row:
                user_id = int(row['id'])
            else:
                continue
        else:
            raise NotImplementedError

        new = {
            'fed_id': fed['fed_id'],
            'user_id': user_id
        }

        if 'reason' in row:
            new['reason'] = row['reason']

        if 'by' in row:
            new['by'] = int(row['by'])
        else:
            new['by'] = message.from_user.id

        if 'time' in row:
            new['time'] = datetime.fromtimestamp(int(row['time']))
        else:
            new['time'] = current_time

        if 'banned_chats' in row and type(row['banned_chats']) is list:
            new['banned_chats'] = row['banned_chats']

        queue_del.append(DeleteMany({'fed_id': fed['fed_id'], 'user_id': user_id}))
        queue_insert.append(InsertOne(new))

        if len(queue_insert) == 1000:
            real_counter += len(queue_insert)
            # Run the delete operations before inserting the new bans.
            if queue_del:
                await db.fed_bans.bulk_write(queue_del, ordered=False)
            await db.fed_bans.bulk_write(queue_insert, ordered=False)
            queue_del = []
            queue_insert = []

    # Process the last chunk of bans
    real_counter += len(queue_insert)
    if queue_del:
        await db.fed_bans.bulk_write(queue_del, ordered=False)
    if queue_insert:
        await db.fed_bans.bulk_write(queue_insert, ordered=False)

    await msg.edit_text(strings['import_done'].format(num=real_counter))
def reducer(self, key, values):
    """
    Cleans the metering data:
        - gets "accumulated" or "instant" values
        - removes negatives and outliers
        - detects gaps
        - generates a daily dataframe
    :param key: the device
    :param values: the information
    :return:
    """
    # create a dataframe with the values:
    df = pd.DataFrame.from_records(
        values, columns=["ts", "value", "accumulated", "energytype", "source"])
    # group it by source and energy type
    source_group = df.groupby('source')
    for source, df_source_group in source_group:
        etype_group = df_source_group.groupby('energytype')
        for etype, df_etype_group in etype_group:
            df_etype_group = df_etype_group.set_index('ts')
            df_etype_group = df_etype_group.sort_index()
            df_etype_group['ts'] = df_etype_group.index

            # save billing information in raw_data
            raw_data = df_etype_group[["ts", "value", "accumulated"]].to_dict('records')
            for r in raw_data:
                r.update({
                    "device": key, "source": source, "energy_type": etype,
                    "data_type": "metering", "freq": "D"
                })
            ops = [InsertOne(x) for x in raw_data]
            result = self.mongo['raw_data'].bulk_write([
                DeleteMany({
                    "device": key, "source": source, "energy_type": etype,
                    "data_type": "metering", "freq": "D"
                }),
            ] + ops)

            # check if metering is accumulated or instant:
            duplicated_index = df_etype_group.index.duplicated(keep='last')
            duplicated_values = df_etype_group[duplicated_index].index.values.tolist()
            df_etype_group = df_etype_group[~duplicated_index]
            freq = calculate_frequency(df_etype_group)
            if not freq:
                self.mongo['clean_data'].update(
                    {"device": key, "source": source, "energy_type": etype,
                     "data_type": "metering", "freq": "D"},
                    {"$set": {"errors": "can't infer frequency"}},
                    upsert=True)
                continue
            day_delta = timedelta(days=1)
            if df_etype_group.value.isnull().all():
                # accumulated
                df_etype_group = df_etype_group[['accumulated']]
                if freq < day_delta:
                    # sub-daily frequency
                    df_etype_group = df_etype_group.resample("D").max().interpolate().diff(
                        1, 0).rename(columns={"accumulated": "value"})
                else:
                    # super-daily frequency
                    df_etype_group = df_etype_group.resample("D").interpolate().diff(
                        1, 0).rename(columns={"accumulated": "value"})
            elif df_etype_group.accumulated.isnull().all():
                # instant
                df_etype_group = df_etype_group[['value']]
                if freq < day_delta:
                    # sub-daily frequency
                    df_etype_group = df_etype_group.resample("D").sum()
                else:
                    # super-daily frequency
                    df_etype_group.value = df_etype_group.value.cumsum()
                    df_etype_group = df_etype_group.resample("D").interpolate().diff(1, 0)
            else:
                self.mongo['clean_data'].update(
                    {"device": key, "source": source, "energy_type": etype,
                     "data_type": "metering", "freq": "D"},
                    {"$set": {"errors": "device with accumulated and instant values at the same metering"}},
                    upsert=True)
                continue
            df_etype_group['ts'] = df_etype_group.index

            # max_threshold = self.config['max_threshold'][etype] * 24 if etype in self.config['max_threshold'] else self.config['max_threshold']['default'] * 24
            # max_outlier_bool = dc.detect_max_threshold_outliers(df_etype_group['value'], max_threshold)
            # df_etype_group['value'] = dc.clean_series(df_etype_group['value'], max_outlier_bool)
            negative_values_bool = dc.detect_min_threshold_outliers(df_etype_group['value'], 0)
            df_etype_group['value'] = dc.clean_series(df_etype_group['value'], negative_values_bool)
            # znorm_bool = dc.detect_znorm_outliers(df_etype_group['value'], 30, mode="global")
            # df_etype_group['value'] = dc.clean_series(df_etype_group['value'], znorm_bool)
            # max_outliers = list(df_etype_group[max_outlier_bool].index)
            negative_outliers = list(df_etype_group[negative_values_bool].index)
            # znorm_outliers = list(df_etype_group[znorm_bool].index)
            missing_values = list(df_etype_group[df_etype_group.value.isnull()].index)

            clean_data = df_etype_group[['ts', 'value']].to_dict('records')
            for r in clean_data:
                r.update({
                    "device": key, "source": source, "energy_type": etype,
                    "data_type": "metering", "freq": "D"
                })
            ops = [InsertOne(x) for x in clean_data]
            result = self.mongo['clean_data'].bulk_write([
                DeleteMany({
                    "device": key, "source": source, "energy_type": etype,
                    "data_type": "metering", "freq": "D"
                }),
            ] + ops)

            self.mongo['data_quality'].update(
                {"device": key, "source": source, "energy_type": etype,
                 "data_type": "metering", "freq": "D"},
                {"$set": {
                    "duplicated_values": duplicated_values,
                    "frequency": freq.resolution,
                    "gaps": missing_values,
                    "negative_values": negative_outliers
                }},
                upsert=True)

            for row in df_etype_group.iterrows():
                yield None, "\t".join([
                    str(row[1]['ts'].timestamp()), key,
                    str(row[1]['value']), etype, source
                ])
async def retry_messages_job(shared_stats):
    """Every few minutes, try to handle messages that were added to the
    pending queue (unavailable messages)."""
    seen_ids = {}
    actions = []
    messages_actions = []
    gtasks: List[Coroutine] = []
    tasks = []
    loop = asyncio.get_event_loop()
    i = 0
    j = 0
    find_params = {}
    # if await PendingTX.collection.count_documents({}) > 500:
    #     find_params = {'message.item_type': 'inline'}

    while await PendingMessage.collection.count_documents(find_params):
        async for pending in PendingMessage.collection.find(find_params).sort([
            ('message.time', 1)
        ]).batch_size(256):
            LOGGER.debug(f"retry_message_job len_seen_ids={len(seen_ids)} "
                         f"len_gtasks={len(gtasks)} len_tasks={len(tasks)}")
            if shared_stats is not None:
                shared_stats['retry_messages_job_seen_ids'] = len(seen_ids)
                shared_stats['retry_messages_job_gtasks'] = len(gtasks)
                shared_stats['retry_messages_job_tasks'] = len(tasks)
                shared_stats['retry_messages_job_actions'] = len(actions)
                shared_stats['retry_messages_job_messages_actions'] = len(messages_actions)
                shared_stats['retry_messages_job_i'] = i
                shared_stats['retry_messages_job_j'] = j

            if pending['message']['item_type'] == 'ipfs' or pending['message']['type'] == 'STORE':
                i += 15
                j += 100
            else:
                i += 1
                j += 1

            tasks.append(asyncio.create_task(
                handle_pending_message(pending, seen_ids, actions, messages_actions)))

            if j >= 20000:
                # Group tasks using asyncio.gather in `gtasks`.
                # await join_pending_message_tasks(tasks, actions_list=actions, messages_actions_list=messages_actions)
                gtasks.append(asyncio.create_task(
                    join_pending_message_tasks(
                        tasks,
                        actions_list=actions,
                        messages_actions_list=messages_actions)))
                tasks = []
                actions = []
                messages_actions = []
                i = 0
                j = 0

            if i >= 1024:
                await join_pending_message_tasks(tasks)
                # gtasks.append(asyncio.create_task(join_pending_message_tasks(tasks)))
                tasks = []
                i = 0

        gtasks.append(asyncio.create_task(
            join_pending_message_tasks(
                tasks, actions_list=actions,
                messages_actions_list=messages_actions)))

        await asyncio.gather(*gtasks, return_exceptions=True)
        gtasks = []

        if await PendingMessage.collection.count_documents(find_params) > 100000:
            LOGGER.info('Cleaning messages')
            clean_actions = []
            # big collection, try to remove dups.
            for key, height in seen_ids.items():
                clean_actions.append(DeleteMany({
                    'message.item_hash': key[0],
                    'message.sender': key[1],
                    'source.chain_name': key[2],
                    'source.height': {'$gt': height}
                }))
            result = await PendingMessage.collection.bulk_write(clean_actions)
            LOGGER.info(repr(result))
        await asyncio.sleep(5)
def reducer(self, key, values):
    """
    Cleans the meteo data:
        - removes values above/below the configured thresholds
        - detects gaps
        - generates a daily dataframe
    :param key: the station
    :param values: the information
    :return:
    """
    # create a dataframe with the values:
    df = pd.DataFrame.from_records(values)
    # keep the configured output columns (stationId is added back per record)
    columns = [x[0] for x in self.config['output']['fields']]
    columns.remove("stationId")
    df = df.set_index('ts')
    df = df.sort_index()
    df['ts'] = df.index
    raw_data = df[columns].to_dict('records')
    for r in raw_data:
        r.update({"stationId": key})
    ops = [InsertOne(x) for x in raw_data]
    result = self.mongo['meteo_raw_data'].bulk_write(
        [DeleteMany({"stationId": key})] + ops
    )

    # check for duplicated meteo data
    duplicated_index = df.index.duplicated(keep='last')
    duplicated_values = df[duplicated_index].index.values.tolist()
    df = df[~duplicated_index]

    max_threshold = self.config['threshold']['max']
    max_outlier_bool = dc.detect_max_threshold_outliers(df['temperature'], max_threshold)
    df['temperature'] = dc.clean_series(df['temperature'], max_outlier_bool)
    min_threshold = self.config['threshold']['min']
    min_threshold_bool = dc.detect_min_threshold_outliers(df['temperature'], min_threshold)
    df['temperature'] = dc.clean_series(df['temperature'], min_threshold_bool)
    # znorm_bool = dc.detect_znorm_outliers(df['temperature'], 30, mode="global")
    # df['temperature'] = dc.clean_series(df['temperature'], znorm_bool)
    max_outliers = list(df[max_outlier_bool].index)
    negative_outliers = list(df[min_threshold_bool].index)
    # znorm_outliers = list(df[znorm_bool].index)
    missing_values = list(df[df.temperature.isnull()].index)

    clean_data = df[columns].to_dict('records')
    for r in clean_data:
        r.update({"stationId": key})
    ops = [InsertOne(x) for x in clean_data]
    result = self.mongo['meteo_clean_data'].bulk_write(
        [DeleteMany({"stationId": key})] + ops
    )

    self.mongo['meteo_data_quality'].update(
        {"stationId": key},
        {"$set": {
            "overlapings": duplicated_values,
            "gaps": missing_values,
            "negative_values": negative_outliers,
            # "znorm_outliers": znorm_outliers,
            "max_outliers": max_outliers
        }},
        upsert=True)

    all_fields = [x[0] for x in self.config['output']['fields']]
    for row in df.iterrows():
        return_list = []
        for f in all_fields:
            if f == "ts":
                return_list.append(str(row[1]['ts'].timestamp()))
            elif f == "stationId":
                return_list.append(key)
            else:
                return_list.append(str(row[1][f]))
        yield None, "\t".join(return_list)
def delete_many(self, criteria):
    self._batch.append(DeleteMany(criteria))
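# The one-liner above is only a fragment; a minimal sketch of the kind of
# batch wrapper it implies (class and method names here are assumptions):
from pymongo import DeleteMany

class Bulk:
    def __init__(self, collection):
        self._collection = collection
        self._batch = []

    def delete_many(self, criteria):
        self._batch.append(DeleteMany(criteria))

    def save(self):
        # Flush all queued operations in a single bulk_write round trip.
        if self._batch:
            result = self._collection.bulk_write(self._batch)
            self._batch = []
            return result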
def remove(self, collection: str, col_filter: dict):
    if collection not in self.transactions:
        self.transactions[collection] = []
    self.transactions[collection].append(DeleteMany(col_filter))
    return Remove(self.transactions)
def delete(self):
    """
    Delete multiple object(s) based on a filter
    ---
    parameters:
      - in: body
        description: List of filter fields and values, on the basis of which
          the object(s) will be deleted
        schema:
          type: array
          items:
            anyOf:
              - schema:
                  properties:
                    _id:
                      type: string
              - schema:
                  properties:
                    name:
                      type: string
              - schema:
                  properties:
                    brand_name:
                      type: string
              - schema:
                  properties:
                    regular_price_value:
                      type: number
                      format: float
              - schema:
                  properties:
                    offer_price_value:
                      type: number
                      format: float
              - schema:
                  properties:
                    currency:
                      type: string
              - schema:
                  properties:
                    classification_l1:
                      type: string
              - schema:
                  properties:
                    classification_l2:
                      type: string
              - schema:
                  properties:
                    classification_l3:
                      type: string
              - schema:
                  properties:
                    classification_l4:
                      type: string
              - schema:
                  properties:
                    image_url:
                      type: string
    responses:
      '200':
        description: Bulk Write result object from MongoDB
        content:
          application/json:
            schema:
              type: object
              properties:
                acknowledged:
                  type: boolean
                matched_count:
                  type: string
                modified_count:
                  type: integer
                deleted_count:
                  type: integer
                upserted_ids:
                  type: array
                  items:
                    type: object
                    properties:
                      _id:
                        type: string
                inserted_count:
                  type: integer
      '500':
        description: Server encountered an error while performing bulk operation
        content:
          application/json:
            schema:
              type: object
              properties:
                message:
                  type: string
    """
    requests = []
    for q in request.json:
        requests.append(DeleteMany(q))
    return perform_bulk(collection, requests), 200