def get_update_requests(self, member_stats, member=None):
    """Build the bulk-write requests that store a member's newest status items.

    Args:
        member_stats: object exposing ``guild_ids``, ``id`` and
            ``process_status(status_value, update=True)``.
        member: optional member object; when falsy, the member is looked up
            through ``self.bot`` in each guild listed in ``member_stats``.

    Returns:
        list: two ``pymongo.UpdateOne`` requests (prune old status entries,
        then push the new ones), or ``[]`` when the member cannot be found
        or there is nothing to push.
    """
    m = member
    if not m:
        for gid in member_stats.guild_ids:
            guild = self.bot.get_guild(gid)
            if guild is None:
                # NOTE(review): assumes get_guild returns None for guilds the
                # bot cannot see; skip instead of crashing on .get_member.
                continue
            m = guild.get_member(member_stats.id)
            if m:
                break
        else:
            # No guild yielded the member.
            return []

    items = member_stats.process_status(m.status.value, update=True)
    if not items:
        # Nothing to push, and no last mark to prune against.
        return []

    last_mark = items[-1]["mark"]
    return [
        # Drop status entries whose mark is more than 720 below the newest one.
        pymongo.UpdateOne(
            {"user_id": m.id},
            {"$pull": {"status": {"mark": {"$lt": last_mark - 720}}}},
        ),
        pymongo.UpdateOne(
            {"user_id": m.id},
            {
                "$push": {"status": {"$each": items}},
                "$setOnInsert": {"user_id": m.id, "timezone": 0},
            },
            upsert=True,
        ),
    ]
def update_mongo(db, collection, df, host='localhost', port=27017,
                 username=None, password=None, no_id=False):
    """Upsert the ``filtered`` and ``applied`` columns of *df* into MongoDB.

    Each row is matched on its ``_id`` value and written with ``upsert=True``.

    Args:
        db: database name.
        collection: collection name inside ``db``.
        df: object providing ``iterrows()`` whose rows support ``.get``
            (presumably a pandas DataFrame).
        host, port, username, password: forwarded to ``_connect_mongo``.
        no_id: accepted for interface compatibility; currently unused.

    Returns:
        None. When *df* has no rows nothing is written and no connection is
        opened.
    """
    # One request per row: both fields go into the same $set.
    updates = [
        pymongo.UpdateOne(
            {'_id': row.get('_id')},
            {'$set': {
                'filtered': row.get('filtered'),
                'applied': row.get('applied'),
            }},
            upsert=True,
        )
        for _, row in df.iterrows()
    ]
    if not updates:
        # bulk_write rejects an empty request list, so stop here.
        return

    conn = _connect_mongo(host=host, port=port, username=username,
                          password=password, db=db)
    conn[db][collection].bulk_write(updates)
def find_discoverer(maxbsur):
    """Flag every author in ``researchers0810_trainingset`` with ``ifdis``.

    ``ifdis`` is set to 0 when ``0 <= sur < maxbsur`` and to 1 otherwise.
    Updates are flushed with unordered bulk writes every 10000 authors.

    Args:
        maxbsur: exclusive upper bound applied to each author's ``sur`` value.
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a write fails.
    with col_author.find(no_cursor_timeout=True) as cursor:
        for author in cursor:
            count += 1
            ifdis = 0 if 0 <= author["sur"] < maxbsur else 1
            operation.append(
                pymongo.UpdateOne({"_id": author["_id"]},
                                  {"$set": {"ifdis": ifdis}}))
            if count % 10000 == 0:
                print("已处理:", count / 10000, flush=True)
                col_author.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
                print(time(), flush=True)
        if operation:
            # Flush the final partial batch.
            col_author.bulk_write(operation, ordered=False)
            print("又写入并完成", len(operation))
def UpdateCountrySummary(self, countryData, d):
    """Queue daily and 10-minute summary increments for one sensor record.

    For each supported country whose bounds contain ``d`` (as decided by
    ``self.CheckCountryBound``), two upserting ``$inc`` requests are appended
    to ``countryData``, keyed by the target table name.

    Args:
        countryData: dict mapping table name to a list of pending requests;
            it is modified in place.
        d: record providing ``gridY``, ``pm25``, ``weight`` and ``time``
            (an object with ``minute`` and ``replace``, presumably a datetime).

    Returns:
        dict: the same ``countryData`` object.
    """
    # (country, table-name suffix)
    countries = (("Taiwan", "s"), ("Korea", "_krs"))
    for country, countryCode in countries:
        if not self.CheckCountryBound(country, d):
            continue

        area = self.LatToArea(country, d["gridY"] / self.gridPerUnit)
        inc = {area + "Sum": d["pm25"], area + "Num": d["weight"]}

        stamp = d["time"]
        # Zero microseconds too, so every record in a bucket shares one _id.
        tday = stamp.replace(hour=0, minute=0, second=0, microsecond=0)
        t10min = stamp.replace(minute=stamp.minute - stamp.minute % 10,
                               second=0, microsecond=0)

        countryData.setdefault("sensordailysum" + countryCode, []).append(
            pymongo.UpdateOne({"_id": tday}, {"$inc": inc}, upsert=True))
        countryData.setdefault("sensor10minsum" + countryCode, []).append(
            pymongo.UpdateOne({"_id": t10min}, {"$inc": inc}, upsert=True))
    return countryData
def update_array(path, change_list):
    """Build the write operations that refresh changed embedded documents.

    The result is a ``$pull`` removing every array element at ``path`` that
    matches an entry of ``change_list``, followed by a ``$push`` re-adding the
    matching elements read from the local competitions collection.

    Args:
        path: dotted path of the array inside the competition document.
        change_list: list of dicts; each dict's key/value pairs identify one
            embedded document to replace.

    Returns:
        list: ``pymongo.UpdateOne`` operations, or ``[]`` when there are no
        changes or the local data could not be found.
    """
    # No changes at this path: nothing to write.
    if not change_list:
        return []

    # Remove documents to be updated.
    write_operations = [
        pymongo.UpdateOne({'tba_event_key': utils.TBA_EVENT_KEY},
                          {'$pull': {path: {'$or': change_list}}})
    ]

    # Select documents to add: an element matches when all key/value pairs of
    # any single change are equal.
    filter_change_list = [
        {'$and': [{'$eq': [f'$$item.{key}', value]}
                  for key, value in change.items()]}
        for change in change_list
    ]
    cursor = local_database_communicator.DB.competitions.aggregate([
        {'$match': {'tba_event_key': utils.TBA_EVENT_KEY}},
        {'$project': {
            path: {
                '$filter': {
                    'input': f'${path}',
                    'as': 'item',
                    'cond': {'$or': filter_change_list},
                }
            }
        }},
    ])
    # `tba_event_key` is expected to be unique, so at most one document comes
    # back; guard against the case where none does.
    to_add = next(iter(cursor), None)
    if to_add is None:
        utils.log_warning(
            f'No competition document found for event {utils.TBA_EVENT_KEY}.')
        return []

    # Remove `_id` so the only item left is the array nested under `path`.
    to_add.pop('_id', None)
    # Strip the nesting, leaving only the list of changed documents.
    while isinstance(to_add, dict):
        to_add = next(iter(to_add.values()), None)

    # No data matched or the dataset does not exist: warn and return nothing.
    if to_add is None:
        utils.log_warning(f'The dataset at {path} does not exist.')
        return []

    write_operations.append(
        pymongo.UpdateOne({'tba_event_key': utils.TBA_EVENT_KEY},
                          {'$push': {path: {'$each': to_add}}}))
    return write_operations
def _do_write(self, collection, version, symbol, item, previous_version, segment_offset=0):
    """Chunk, compress and bulk-write ``item`` as segments of ``symbol``.

    Segments whose checksum is already stored for the symbol only get the new
    version added to their ``parent`` set; other segments are upserted with
    their data. ``version`` is then updated in place with the segment index,
    segment count and zeroed append counters, and ``self.check_written`` is
    called.
    """
    # Chunk by (uncompressed) size: rows per chunk from the byte size of a row.
    row_bytes = int(item.dtype.itemsize * np.prod(item.shape[1:]))
    chunk_size = int(_CHUNK_SIZE / row_bytes)

    previous_shas = []
    if previous_version:
        known = collection.find({'symbol': symbol},
                                projection={'sha': 1, '_id': 0})
        previous_shas = {Binary(doc['sha']) for doc in known}

    length = len(item)
    existing_index = None
    if segment_offset > 0 and 'segment_index' in previous_version:
        existing_index = previous_version['segment_index']

    segment_index = []
    i = -1

    # Compress
    idxs = xrange(int(np.ceil(float(length) / chunk_size)))
    chunks = [(item[i * chunk_size: (i + 1) * chunk_size]).tostring() for i in idxs]
    compressed_chunks = compress_array(chunks)

    # Write
    parent_update = {'parent': version['_id']}
    bulk = []
    for i, blob in zip(idxs, compressed_chunks):
        # A segment is identified by the offset-adjusted index of its last row.
        last_row = min((i + 1) * chunk_size - 1, length - 1) + segment_offset
        segment = {'data': Binary(blob), 'compressed': True}
        segment['segment'] = last_row
        segment_index.append(last_row)

        sha = checksum(symbol, segment)
        spec = {'symbol': symbol, 'sha': sha, 'segment': last_row}
        if sha in previous_shas:
            # Data already stored: only link it to this version.
            bulk.append(pymongo.UpdateOne(spec, {'$addToSet': {'parent': version['_id']}}))
        else:
            segment['sha'] = sha
            bulk.append(pymongo.UpdateOne(spec,
                                          {'$set': segment,
                                           '$addToSet': {'parent': version['_id']}},
                                          upsert=True))

    # `i` stays -1 only when no chunk was produced.
    if i != -1:
        collection.bulk_write(bulk, ordered=False)

    segment_index = self._segment_index(item,
                                        existing_index=existing_index,
                                        start=segment_offset,
                                        new_segments=segment_index)
    if segment_index:
        version['segment_index'] = segment_index
    version['segment_count'] = i + 1
    version['append_size'] = 0
    version['append_count'] = 0

    self.check_written(collection, symbol, version)
def fmt(x):
    """Convert one operation record into the matching PyMongo bulk request.

    ``x['op']`` selects the request: ``'i'`` becomes an upsert of ``x['o']``
    keyed on its ``_id``; ``'u'`` becomes an upserting update with filter
    ``x['o2']`` and update ``x['o']``; ``'d'`` becomes a ``DeleteMany`` on
    ``x['o']``. Any other value yields ``None``.

    NOTE(review): the record layout looks like a MongoDB oplog entry — confirm.
    """
    kind = x['op']
    if kind == 'i':
        document = x['o']
        return pymongo.UpdateOne({'_id': document['_id']},
                                 {'$set': document},
                                 upsert=True)
    if kind == 'u':
        return pymongo.UpdateOne(x['o2'], x['o'], upsert=True)
    if kind == 'd':
        return pymongo.DeleteMany(x['o'])
    return None
def toptrend(thistime):
    """Recompute the top-10 retweet trend ranking in ``master_data``.

    Retweets recorded in ``retweet_permin_data`` during the 180 minutes before
    ``thistime`` are summed per tweet. Every document that currently has
    ``trend >= 1`` is reset to 0, then up to ten tweets with the highest sums
    get ``trend`` (1-based rank) and ``retweet_30min`` set.

    Args:
        thistime: datetime marking the end of the window.
    """
    since = thistime - dt.timedelta(minutes=180)
    dataB = db.retweet_permin_data.aggregate([
        {"$match": {"timeUpdate": {"$gte": since}}},
        {"$group": {
            "_id": "$id_str",
            "retweetNow": {
                "$sum": {
                    "$cond": [{"$gte": ["$timeUpdate", since]}, "$retweet", 0]
                }
            },
        }},
    ])
    tweetTrends = [{'id': item['_id'], 'retweet': item['retweetNow']}
                   for item in dataB]
    tweetTrends.sort(key=sort_by_retweet, reverse=True)

    dataA = db.master_data.find({'trend': {"$gte": 1}})

    # Slice rather than index 0..9 so fewer than ten trends does not raise.
    top = tweetTrends[:10]
    for entry in top:
        print(entry['id'])
        print(entry['retweet'])

    removeTrend = [
        pymongo.UpdateOne({'id_str': doc['id_str']},
                          {'$set': {"trend": 0}},
                          upsert=True)
        for doc in dataA
    ]
    if removeTrend:
        db.master_data.bulk_write(removeTrend, ordered=False)

    updateTrend = [
        pymongo.UpdateOne(
            {'id_str': entry['id']},
            {'$set': {"trend": rank, "retweet_30min": entry['retweet']}},
            upsert=True)
        for rank, entry in enumerate(top, start=1)
    ]
    if updateTrend:
        db.master_data.bulk_write(updateTrend, ordered=False)
def getTopTrendInOneDay(thistime):
    """Recompute the per-day retweet ranking stored in ``master_data``.

    Retweets recorded in ``retweet_permin_data`` since midnight of
    ``thistime``'s date are summed per tweet. Documents with
    ``trendInDay >= 1`` are reset to 0, then every ranked tweet gets
    ``trendInDay`` (1-based rank) and ``retweet_1Day`` set.
    """
    day_start = dt.datetime(thistime.year, thistime.month, thistime.day, 0, 0, 0, 0)
    per_tweet = db.retweet_permin_data.aggregate([
        {"$match": {"timeUpdate": {"$gte": day_start}}},
        {"$group": {
            "_id": "$id_str",
            "retweetNow": {
                "$sum": {
                    "$cond": [{"$gte": ["$timeUpdate", day_start]}, "$retweet", 0]
                }
            },
        }},
    ])
    ranked = [{'id': row['_id'], 'retweet': row['retweetNow']} for row in per_tweet]
    ranked.sort(key=sort_by_retweet, reverse=True)

    # Clear the previous ranking first.
    previously_ranked = db.master_data.find({'trendInDay': {"$gte": 1}})
    resets = [
        pymongo.UpdateOne({'id_str': doc['id_str']},
                          {'$set': {"trendInDay": 0}},
                          upsert=True)
        for doc in previously_ranked
    ]
    if resets:
        db.master_data.bulk_write(resets, ordered=False)

    # Then write the new ranking.
    assignments = [
        pymongo.UpdateOne(
            {'id_str': entry['id']},
            {'$set': {"trendInDay": rank, "retweet_1Day": entry['retweet']}},
            upsert=True)
        for rank, entry in enumerate(ranked, start=1)
    ]
    if assignments:
        db.master_data.bulk_write(assignments, ordered=False)
def save(self, data):
    """Save a Markov chain model to the database.

    Each key of ``data`` is joined with spaces into a string and upserted
    into ``self.model``; the counts of its value dict are added with ``$inc``
    under ``value.<name>``.

    :param data: Markov chain model
    :type data: dict
    """
    # TODO: end_symbol is not enough add some start ones
    # A unique index on `key` backs the upserts below.
    self.model.create_index([('key', pymongo.ASCENDING)],
                            name='keys',
                            unique=True)

    total = len(data.items())
    batch_size = 1e5
    pending = list()
    for position, (key, value) in enumerate(data.items(), start=1):
        print('saving: {}/{}'.format(position, total))
        start = key[0] == self.tokenizer.end_symbol
        key = ' '.join(map(str, key))
        increments = {
            'value.{}'.format(str(name)): amount
            for name, amount in value.items()
        }

        update = {'$inc': increments}
        if start:
            update['$set'] = {'start': start}
        else:
            # Only record start=False on insert so an existing flag is kept.
            update['$setOnInsert'] = {'start': start}
        pending.append(pymongo.UpdateOne({'key': key}, update, upsert=True))

        if position % batch_size == 0:
            self.model.bulk_write(pending, ordered=False)
            pending = list()

    if pending:
        self.model.bulk_write(pending, ordered=False)
def update(self, data):
    """Upsert the ``chapter`` and ``is_read`` fields for every row in *data*.

    Args:
        data (dict): mapping whose values are rows containing ``id``,
            ``chapter`` and ``is_read``; the mapping keys are not used.

    Returns:
        bool: True when the bulk write succeeded; False when there was
        nothing to write or the write raised ``BulkWriteError``.
    """
    operations = [
        pymongo.UpdateOne(
            {'id': row['id']},
            {'$set': {'chapter': row['chapter'], 'is_read': row['is_read']}},
            upsert=True)
        for row in data.values()
    ]
    if not operations:
        # bulk_write rejects an empty list with InvalidOperation, which the
        # handler below would not catch.
        return False

    try:
        self.collection.bulk_write(operations)
    except pymongo.errors.BulkWriteError as e:
        print(e)
        print(traceback.format_exc())
        return False
    return True
def migrate_tasks(collection, requests):
    """Append a TTL-setting update to *requests* for every finished task.

    A task is selected when any of ``time.completed``, ``time.cancelled`` or
    ``time.failed`` is non-zero. Its expiry is the largest value in ``time``
    (converted with ``utcfromtimestamp``) plus the configured
    ``clean_finished_tasks_after_seconds``.
    """
    finished_fields = ("time.completed", "time.cancelled", "time.failed")
    query = {"$or": [{field: {"$ne": 0}} for field in finished_fields]}

    ttl = datetime.timedelta(
        seconds=CONF["cron"]["clean_finished_tasks_after_seconds"])

    for item in collection.find(query, projection=["_id", "time"]):
        latest = max(item["time"].values())
        expired_at = datetime.datetime.utcfromtimestamp(latest) + ttl
        requests.append(
            pymongo.UpdateOne({"_id": item["_id"]},
                              {"$set": {task.TTL_FIELDNAME: expired_at}}))
def _run_num_from_id(run_id):
    """Return the sortable run number encoded in *run_id*, or -1 if unparseable."""
    run_name = run_id
    if ".run" in run_name:
        # obsolete format: experiment.run_name
        _, run_name = run_name.split(".", 1)

    if not run_name or not run_name.startswith("run"):
        # unrecognized format, just group all of these at -1
        return -1

    base = run_name[3:]
    try:
        if "." in base:
            parent, child = base.split(".")
            # allow for 1 million child runs per parent
            return 1000 * 1000 * int(parent) + int(child)
        return 1000 * 1000 * int(base)
    except ValueError:
        # Non-numeric parts or extra dots are also unrecognized formats.
        return -1


def process_run_batch(collection, records):
    """Set ``run_num`` on every record of the batch with one bulk write.

    ``run_num`` is derived from the record's ``_id``: ``run<P>`` maps to
    ``P * 1000000`` and ``run<P>.<C>`` to ``P * 1000000 + C``. Ids that do
    not follow this shape get ``-1``.

    Args:
        collection: collection receiving the bulk write.
        records: iterable of documents that contain ``_id``.
    """
    updates = [
        pymongo.UpdateOne({"_id": record["_id"]},
                          {"$set": {"run_num": _run_num_from_id(record["_id"])}})
        for record in records
    ]

    # write batch
    if updates:
        collection.bulk_write(updates)
def boot_strap(P_d):
    """Store a bootstrapped surprisal ``bsur`` for every author.

    For each author, 20 values are drawn from ``Binomial(new_con, P_d)``; for
    each draw ``d`` the surprisal ``-log(sf(d - 1, new_con, P_d))`` is
    computed, and the mean of the 20 surprisals is written to ``bsur``.
    Updates are flushed with unordered bulk writes every 10000 authors.

    Args:
        P_d: success probability used for both the draws and the survival
            function.
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a step fails.
    with col_author.find(no_cursor_timeout=True) as cursor:
        for author in cursor:
            count += 1
            coauthor_times = author["new_con"]
            author_id = author["_id"]

            d_i_list = np.random.binomial(coauthor_times, P_d, 20)
            surprisal_list = [
                -math.log(stats.binom.sf(di - 1, coauthor_times, P_d))
                for di in d_i_list
            ]
            S = np.mean(surprisal_list)

            operation.append(
                pymongo.UpdateOne({"_id": author_id}, {"$set": {"bsur": S}}))
            if count % 10000 == 0:
                print("已处理:", count / 10000, flush=True)
                col_author.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
                print(time(), flush=True)
        if operation:
            # Flush the final partial batch.
            col_author.bulk_write(operation, ordered=False)
            print("又写入并完成", len(operation))

    # Report how many documents still carry the initial placeholder values.
    print(col_author.count_documents({"sur": -6}))
    print(col_author.count_documents({"dn": -1}))
    print(col_author.count_documents({"bsur": -6}))
def initialize_surprisal():
    """Reset ``sur`` and ``bsur`` to -6 on every author document.

    Updates are flushed with unordered bulk writes every 10000 authors. At
    the end the number of processed authors and the number of documents with
    ``dn == -1`` are printed.
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a write fails.
    with col_author.find(no_cursor_timeout=True) as cursor:
        for author in cursor:
            count += 1
            operation.append(
                pymongo.UpdateOne({"_id": author["_id"]},
                                  {"$set": {"sur": -6, "bsur": -6}}))
            if count % 10000 == 0:
                print("已处理:", count / 10000, flush=True)
                col_author.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
        if operation:
            # Flush the final partial batch.
            col_author.bulk_write(operation, ordered=False)
            print("finished")

    print(count)
    # count_documents replaces Cursor.count(), which newer PyMongo removed.
    print(col_author.count_documents({"dn": -1}))
def update_ink_embeddings(db, ink_embeddings):
    """Write ``sif_embeddings`` onto the matching ``inktalks`` documents.

    Args:
        db: database handle exposing the ``inktalks`` collection.
        ink_embeddings: iterable of dicts with ``_id`` and ``sif_embeddings``.

    Returns:
        int: number of modified documents as reported by the bulk write;
        0 when ``ink_embeddings`` is empty.
    """
    bulk_ops = [
        pymongo.UpdateOne({"_id": data['_id']},
                          {'$set': {'sif_embeddings': data['sif_embeddings']}})
        for data in ink_embeddings
    ]
    if not bulk_ops:
        # bulk_write rejects an empty request list.
        return 0
    result = db.inktalks.bulk_write(bulk_ops, ordered=False)
    return result.bulk_api_result['nModified']
def researchers_con():
    """Copy each researcher's ``con`` value from ``mag_authors0510``.

    For every document in ``mag_researchers0707`` the document with the same
    ``_id`` is read from ``mag_authors0510`` and its ``con`` value is written
    back onto the researcher. Updates are flushed with unordered bulk writes
    every 10000 researchers.

    :return: None
    """
    col1 = connectTable('qiuzh', "mag_authors0510")
    col2 = connectTable('qiuzh', "mag_researchers0707")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a step fails.
    with col2.find(no_cursor_timeout=True) as cursor:
        for author in cursor:
            count += 1
            author_id = author["_id"]
            # NOTE(review): find_one returns None when the author is missing
            # from mag_authors0510, which raises TypeError here — confirm that
            # every researcher is guaranteed to exist there.
            coauthor_number = col1.find_one({"_id": author_id})["con"]
            operation.append(
                pymongo.UpdateOne({"_id": author_id},
                                  {"$set": {"con": coauthor_number}}))
            if count % 10000 == 0:
                print("已处理:", count / 10000, flush=True)
                col2.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
        if operation:
            # Flush the final partial batch.
            col2.bulk_write(operation, ordered=False)
            print("又处理", len(operation))
def _mongo_main_add_batch(_mongo, source_id, batch, max_attempts=3): unique_accounts = 0 attempts_left = int(max_attempts) mongo_batch = [] error_details = '' for account_doc in batch: _id = account_doc.pop('_id') mongo_batch.append( pymongo.UpdateOne({'_id': _id}, {'$setOnInsert': account_doc}, upsert=True)) while attempts_left > 0: try: result = _mongo.accounts.bulk_write(mongo_batch, ordered=False) unique_accounts = result.upserted_count return unique_accounts # sleep for a bit and try again if there's an error except (pymongo.errors.OperationFailure, pymongo.errors.InvalidOperation) as e: try: error_details = str(e.details)[:128] except AttributeError: pass attempts_left -= 1 sleep(5) continue raise pymongo.errors.PyMongoError( 'Failed to add batch to main DB after {} tries'.format( max_attempts))
def _update_object(cls, data, meta=None, fmt=None, state=None, bulk=None) -> bool:
    """Store ``data`` when its hash differs from the stored one.

    Args:
        data: object payload; must contain ``id``. A ``$meta`` entry is popped
            and used as ``meta`` when ``meta`` is not given. ``change_id`` is
            added to it before it is serialized.
        meta: optional metadata stored alongside the data.
        fmt: optional format name; selects the collection and names.
        state: optional mapping of object id to stored document, used instead
            of querying the collection when truthy.
        bulk: optional list; when given, the write is appended to it instead
            of being executed immediately.

    Returns:
        True when the object was written (or queued), False when unchanged.
    """

    def differs(stored, digest):
        # A missing or empty stored document always counts as changed.
        if not stored:
            return True
        return stored.get(cls.F_HASH) != digest

    obj_id = cls.clean_id(data["id"])
    if meta is None and "$meta" in data:
        meta = data.pop("$meta")

    # Metric name and log label.
    if fmt:
        m_name = "%s_%s" % (cls.name, fmt)
        l_name = "%s|%s|%s" % (cls.name, obj_id, fmt)
    else:
        m_name = cls.name
        l_name = "%s|%s" % (cls.name, obj_id)
    metrics["ds_%s_updated" % m_name] += 1

    digest = cls.get_hash(data)

    # Fetch the stored hash, from the supplied state when there is one.
    if state:
        doc = state.get(obj_id)
    else:
        doc = cls.get_collection(fmt).find_one(
            {cls.F_ID: obj_id}, {cls.F_ID: 0, cls.F_HASH: 1})

    if not differs(doc, digest):
        logger.info("[%s] Object hasn't been changed", l_name)
        return False  # Not changed

    # on_change returned a truthy value for the unformatted object: re-hash.
    if not fmt and cls.on_change(data):
        digest = cls.get_hash(data)
        if not differs(doc, digest):
            logger.info("[%s] Object hasn't been changed", l_name)
            return False  # Not changed after altering

    metrics["ds_%s_changed" % m_name] += 1
    change_id = bson.ObjectId()
    data["change_id"] = str(change_id)

    fields = {
        cls.F_CHANGEID: change_id,
        cls.F_HASH: digest,
        cls.F_DATA: smart_text(orjson.dumps(data)),
    }
    op = {"$set": fields}
    if meta:
        fields[cls.F_META] = meta
    elif "$deleted" not in data:
        op["$unset"] = {cls.F_META: ""}

    if bulk is None:
        cls.get_collection(fmt).update_one({cls.F_ID: obj_id}, op, upsert=True)
    else:
        bulk += [pymongo.UpdateOne({cls.F_ID: obj_id}, op, upsert=True)]
    logger.info("[%s] Object has been changed", l_name)

    if cls.enable_message:
        # Build MX message
        logger.info("[%s] Sending message", l_name)
        cls.send_message(data, change_id)
    return True
def get_messages(client, token):
    """Download and store the messages of every issue that has none yet.

    Issues without a ``messages`` field are visited newest-modified first.
    Each one is fetched from the code review API; a 404 is skipped, any other
    non-200 response prints the body and exits with status 1. Results are
    written back in bulk every 1000 fetched issues.
    """
    issues = client.codereview.issues
    auth_headers = dict(Authorization="OAuth " + token)
    pending = []
    fetched = 0

    without_messages = issues.find({"messages": {"$exists": False}})
    for issue in without_messages.sort([("modified", pymongo.DESCENDING)]):
        print(issue["issue"])
        r = requests.get(
            "https://mongodbcr.appspot.com/api/" + str(issue["issue"]),
            params={"messages": True, "format": "json"},
            headers=auth_headers)
        print(r.status_code)

        if r.status_code == 404:
            # The patch set was probably deleted somehow?
            print("Skipping patch set with a 404")
            continue
        if r.status_code != 200:
            print(r.text)
            exit(1)

        pending.append(
            pymongo.UpdateOne({"_id": issue["_id"]},
                              {"$set": {"messages": r.json()["messages"]}}))
        fetched += 1
        if fetched % 1000 == 0:
            issues.bulk_write(pending)
            pending = []

    if pending:
        issues.bulk_write(pending)
def check_new_messages(self):
    """Check for new messages.

    Prints every unseen message addressed to the current user, marks those
    messages as seen, and returns them as formatted strings. When there are
    no unseen messages a notice is printed and ``None`` is returned.
    """
    current_user = self._get_user_data()
    unseen = {'receiver_id': current_user._id, 'seen': False}
    messages = list(self._db.messages.find(unseen))

    if not messages:
        print(_('You have not any new messages'))
        return

    msg_ids = []
    messages_list = []
    for msg in messages:
        sender = self._db.users.find_one({'_id': msg['sender_id']})
        sender_login = sender['login']
        text = msg['text']
        ts = msg['ts']
        print(f'{sender_login} at {ts}: {text}')
        messages_list.append(f'{sender_login} at @@@: {text}')
        msg_ids.append(msg['_id'])

    # Mark everything that was shown as seen in one round trip.
    mark_seen = []
    for msg_id in msg_ids:
        mark_seen.append(
            pymongo.UpdateOne({'_id': msg_id}, {'$set': {'seen': True}}))
    self._db.messages.bulk_write(mark_seen)
    return messages_list
def write(self, to_sink, data, index=None):
    """Write records into the ``to_sink`` collection of ``self.bili_db``.

    Without ``index`` the records are inserted. With ``index`` each record is
    upserted, matched on the value(s) of the index field(s).

    Args:
        to_sink: name of the target collection.
        data: a pandas DataFrame, a single dict, or a sequence of dicts.
            ``None`` and empty input are reported and ignored.
        index: field name (str) or list/tuple of field names used as the
            upsert key; ``None`` or ``""`` means plain insert.

    Raises:
        TypeError: when ``index`` is neither a string nor a list/tuple.
    """
    coll_curr = self.bili_db[to_sink]

    # Normalize the data to a list of records, e.g. [{'a': 1}, {'a': 2}].
    if isinstance(data, pd.DataFrame):
        data = data.to_dict(orient='records')
    elif isinstance(data, dict):
        data = [data]
    elif data is None:
        print("数据为空, 无法写入")
        return

    # Check for emptiness before either write path: insert_many and
    # bulk_write both reject empty input.
    if len(data) == 0:
        print("sink:{}, 输入数据集为空".format(to_sink))
        return

    if index is None or index == "":
        coll_curr.insert_many(data)
        return

    if isinstance(index, str):
        key_fields = [index]
    elif isinstance(index, (list, tuple)):
        key_fields = list(index)
    else:
        raise TypeError(
            "index must be a str or a list of str, got {}".format(
                type(index).__name__))

    bulkWriteResult = coll_curr.bulk_write([
        pymongo.UpdateOne({field: item[field] for field in key_fields},
                          {"$set": item},
                          upsert=True)
        for item in data
    ])
    print("sink:{}, 匹配{}条数据".format(to_sink, bulkWriteResult.matched_count))
    print("sink:{}, 写入{}条数据".format(to_sink, bulkWriteResult.upserted_count))
    print("sink:{}, 修改{}条数据".format(to_sink, bulkWriteResult.modified_count))
def update_mongo_compound_variants(self, bulk):
    """Write the compound information of a bulk of variants to the database.

    Variants without (or with empty) ``compounds`` are skipped; when nothing
    is left to update no write is issued.

    Args:
        bulk(dict): {'_id': scout.models.Variant}

    Raises:
        BulkWriteError: re-raised after logging a warning.
    """
    # One update request per variant that carries compounds.
    requests = [
        pymongo.UpdateOne({"_id": var_obj["_id"]},
                          {"$set": {"compounds": var_obj["compounds"]}})
        for var_obj in (bulk[var_id] for var_id in bulk)
        if var_obj.get("compounds")
    ]
    if not requests:
        return

    try:
        self.variant_collection.bulk_write(requests, ordered=False)
    except BulkWriteError:
        LOG.warning("Updating compounds failed")
        raise
def initialize_discover_number():
    '''
    Reset ``dn`` to -1 on every document of ``researchers0810_trainingset``.

    Updates are flushed with unordered bulk writes every 10000 authors. At
    the end the number of processed authors and the number of documents with
    ``dn == -1`` are printed.

    this function is used in 2021.8.12 in mag_researchers0810
    in 2021.9.1 we used this function in researchers0810_trainingset
    :return: None
    '''
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a write fails.
    with col_author.find(no_cursor_timeout=True) as cursor:
        for author in cursor:
            count += 1
            operation.append(
                pymongo.UpdateOne({"_id": author["_id"]},
                                  {"$set": {"dn": -1}}))
            if count % 10000 == 0:
                print("已处理:", count / 10000, flush=True)
                col_author.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
        if operation:
            # Flush the final partial batch.
            col_author.bulk_write(operation, ordered=False)
            print("finished")

    print(count)
    # count_documents replaces Cursor.count(), which newer PyMongo removed.
    print(col_author.count_documents({"dn": -1}))
def _getBulkUpsertOperations(arrays, simple): if simple: return list( map( lambda item: pymongo.UpdateOne(item, {'$set': item}, upsert=True), arrays)) return list(map(_transformUpdateOne, arrays))
def _mongo_meta_add_batch(_mongo, source_id, batch, max_attempts=3): attempts_left = int(max_attempts) mongo_tags_batch = [] for account_doc in batch: _id = account_doc['_id'] mongo_tags_batch.append( pymongo.UpdateOne({'_id': _id}, {'$addToSet': { 's': source_id }}, upsert=True)) while attempts_left > 0: try: _mongo.account_tags.bulk_write(mongo_tags_batch, ordered=False) return # sleep for a bit and try again if there's an error except (pymongo.errors.OperationFailure, pymongo.errors.InvalidOperation) as e: #errprint('\n[!] Error adding account batch to meta DB. Attempting to continue.\n{}'.format(str(e)[:64])) #try: # errprint(str(e.details)[:64]) #except AttributeError: # pass attempts_left -= 1 sleep(5) continue
def set_string_field_to_datetime(self, self_col: Collection = None, field_tag='TimeStamp'):
    """Convert non-date values of ``field_tag`` to datetimes.

    Every document whose ``field_tag`` is not of BSON type ``date`` has the
    value passed through ``parse_datetime`` and written back in one bulk
    write.

    Args:
        self_col: optional collection to switch to before running; ignored
            when it equals the collection already in use.
        field_tag: name of the field to convert.

    Returns:
        The bulk write's ``modified_count``, or 0 when nothing needed
        converting.
    """
    if self_col is not None:
        if self_col != self._collection:
            self.collection = self_col
            logger.info("MongoInterface now using Collection: '{%s}'", self.collection.name)

    # Select documents whose field is not yet a date; fetch only that field.
    not_a_date = {field_tag: {'$not': {'$type': "date"}}}
    only_field = {field_tag: 1}

    with self.collection.find(not_a_date, only_field) as cursor:
        bulk_requests = [
            pym.UpdateOne({'_id': doc['_id']},
                          {'$set': {field_tag: parse_datetime(doc[field_tag])}})
            for doc in cursor
        ]
        result = 0
        if bulk_requests:
            result = self.collection.bulk_write(bulk_requests).modified_count

    logger.info("Modified TimeStamp type for %s documents", result)
    return result
def paper_citation_number(begin, end, msg):
    '''
    Store each paper's citation count as ``cn_before1996`` in mag_papers0510.

    For every paper in the ``[begin:end]`` slice of ``mag_papers0510``, the
    lengths of the ``citation`` lists of all documents in
    ``citation_network0810_trainingset`` whose ``id`` equals the paper id are
    summed. Updates are flushed with unordered bulk writes every 10000 papers.

    :param begin: start offset of the slice of papers to process
    :param end: end offset (exclusive) of the slice
    :param msg: label prefixed to the progress output
    :return: None
    '''
    colpaper = connectTable("qiuzh", "mag_papers0510")
    col_citation_network = connectTable("qiuzh",
                                        "citation_network0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a step fails.
    with colpaper.find(no_cursor_timeout=True)[begin:end] as cursor:
        for paper in cursor:
            count += 1
            paper_id = paper["_id"]
            with col_citation_network.find(
                    {"id": paper_id}, no_cursor_timeout=True) as relations:
                citation_number = sum(
                    len(relation["citation"]) for relation in relations)
            operation.append(
                pymongo.UpdateOne({"_id": paper_id},
                                  {"$set": {"cn_before1996": citation_number}}))
            if count % 10000 == 0:
                print(msg, "已处理:", count / 10000, flush=True)
                colpaper.bulk_write(operation, ordered=False)
                print(msg, "已写入:", count / 10000, flush=True)
                operation = []
                print(time(), flush=True)
        if operation:
            # Flush the final partial batch.
            colpaper.bulk_write(operation, ordered=False)
def put_bulk(self, payload_list, selector_key, priority=0):
    """Upsert a list of payloads into the queue collection in one bulk write.

    Each payload is normalized by ``self._payload_validator`` and upserted,
    matched on ``payload.<selector_key>`` equal to the normalized payload's
    ``selector_key`` value.

    :param payload_list: payloads to save into the queue
    :param selector_key: payload field used to detect an existing item
    :param priority: accepted but not used by this method
    :raises PayloadValidationError: when a payload fails validation; nothing
        is written in that case
    :returns: the result of ``self.col.bulk_write``
    """
    payload_key = 'payload.{}'.format(selector_key)
    validator = self._payload_validator

    ops = []
    for payload in payload_list:
        payload_normalized = validator.normalized(payload)
        op = pymongo.UpdateOne(
            {payload_key: payload_normalized[selector_key]},
            {'$set': payload_normalized},
            upsert=True,
        )
        if validator.validate(payload) is False:
            raise PayloadValidationError(
                "Vaidation_errors: {}".format(self._payload_validator.errors))
        ops.append(op)

    return self.col.bulk_write(ops)
def new_pub_count(begin, end, msg):
    """Store the number of each author's papers dated 1996 or earlier.

    For every author in the ``[begin:end]`` slice of
    ``researchers0810_trainingset``, the entries of ``new_pubs`` with
    ``year <= 1996`` are counted and written to ``pub_count``. Updates are
    flushed with unordered bulk writes every 10000 authors.

    Args:
        begin: start offset of the slice of authors to process.
        end: end offset (exclusive) of the slice.
        msg: label prefixed to the progress output.
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    # The context manager closes the no-timeout cursor even if a write fails.
    with col_author.find(no_cursor_timeout=True)[begin:end] as cursor:
        for author in cursor:
            count += 1
            pub_count = sum(
                1 for paper in author["new_pubs"] if paper["year"] <= 1996)
            operation.append(
                pymongo.UpdateOne({"_id": author["_id"]},
                                  {"$set": {"pub_count": pub_count}}))
            if count % 10000 == 0:
                print(msg, "已处理:", count / 10000, flush=True)
                col_author.bulk_write(operation, ordered=False)
                print(msg, "已写入:", count / 10000, flush=True)
                operation = []
                print(time(), flush=True)
        if operation:
            # Flush the final partial batch.
            col_author.bulk_write(operation, ordered=False)