def add_identical_offers(collection_name, offer_limit, n_highest, provenance=None):
    """Compute identical-offer matches for a site collection's live offers.

    Loads the site's model from S3, fetches up to ``offer_limit`` non-expired
    offers (optionally filtered by provenance), and processes them in chunks
    of ``CHUNK_SIZE``.

    Args:
        collection_name: The siteCollection whose offers are processed.
        offer_limit: Maximum number of offers to fetch from Mongo.
        n_highest: Passed through to ``add_identical_offers_to_batch``.
        provenance: Optional provenance filter; applied only when truthy.

    Returns:
        A list with one ``add_identical_offers_to_batch`` result per chunk.
    """
    collection = get_collection("mpnoffers")
    model = load_model_from_s3(collection_name)
    # NOTE(review): naive local time — confirm validThrough is stored with
    # the same convention (other functions in this file do the same).
    now = datetime.now()
    mongo_filter = {
        "validThrough": {"$gt": now},
        "siteCollection": collection_name,
    }
    if provenance:
        mongo_filter["provenance"] = provenance
    offers = collection.find(
        mongo_filter,
        projection=MONGO_PROJECTION,
        limit=offer_limit,
    )
    offers_list = list(offers)
    # FIX: list comprehension instead of list(generator) (ruff C400).
    return [
        add_identical_offers_to_batch(batch, model, collection_name, n_highest)
        for batch in pydash.chunk(offers_list, CHUNK_SIZE)
    ]
def create_models(collection_name, offer_limit):
    """Fetch current offers for a site collection, train a model, save to S3.

    Args:
        collection_name: The siteCollection whose offers are used for training.
        offer_limit: Maximum number of offers to fetch from the database.

    Returns:
        The result of ``save_model_to_s3`` for the trained model.
    """
    collection = get_collection("mpnoffers")
    # NOTE(review): naive local time — confirm validThrough uses the same
    # convention.
    now = datetime.now()
    start_time = time()
    # BUG FIX: the log line referenced the global OFFER_LIMIT and the $limit
    # stage was hard-coded to 2**17, leaving the offer_limit parameter
    # entirely unused. Both now honor offer_limit.
    print(f"Fetching up to {offer_limit} offers from db.")
    offers = collection.aggregate(
        [
            {
                "$match": {
                    "validThrough": {"$gt": now},
                    "siteCollection": collection_name,
                }
            },
            {"$project": MONGO_PROJECTION},
            {"$limit": offer_limit},
        ],
        allowDiskUse=True,
    )
    offers_list = list(offers)
    print(f"Time spent: {time() - start_time} s.")
    print(f"Got {len(offers_list)} after filter")
    model = get_model(offers_list)
    result = save_model_to_s3(model, collection_name)
    return result
def save_scraped_products(products: Iterable, offers_collection_name: str):
    """Upsert scraped products while preserving recently edited meta fields.

    Offers whose meta document was updated within the last
    ``OVERWRITE_EDIT_LIMIT_DAYS`` days keep their protected fields instead
    of being overwritten by the scrape.

    Args:
        products: Iterable of scraped product dicts.
        offers_collection_name: Name of the offers collection to write to.

    Returns:
        The result of ``bulk_upsert`` (a pymongo BulkWriteResult).
    """
    last_update_limit = datetime.utcnow() - timedelta(
        OVERWRITE_EDIT_LIMIT_DAYS)
    # FIX: was f"mpnoffersmeta" — an f-string with no placeholders (F541).
    meta_fields_collection = get_collection("mpnoffersmeta")
    meta_fields = meta_fields_collection.find(
        dict(updatedAt={"$gt": last_update_limit}))
    uri_field_dict = meta_fields_result_to_dict(meta_fields)
    # FIX: the offers_collection_name parameter was ignored — the target
    # collection was hard-coded to "mpnoffers".
    return bulk_upsert(
        remove_protected_fields(products, uri_field_dict),
        offers_collection_name,
        "uri",
    )
def bulk_upsert(iterable: Iterable, collection_name: str, id_field: str = "uri"):
    """Bulk-write one update operation per input item to a Mongo collection.

    Args:
        iterable: Items converted to write requests via ``get_update_one``.
        collection_name: Target Mongo collection.
        id_field: Unused in this body; kept for interface compatibility.
            NOTE(review): presumably get_update_one keys on "uri" — confirm.

    Returns:
        The pymongo BulkWriteResult of the write.
    """
    print("Start saving to Mongo collection: {}".format(collection_name))
    collection = get_collection(collection_name)
    requests = [get_update_one(item) for item in iterable]
    print("{} items to write".format(len(requests)))
    return collection.bulk_write(requests)
def save_similar_offers(updates: list):
    """Set the similarOffers field on each offer identified by its uri.

    Args:
        updates: Dicts carrying "uri" and "similarOffers" keys.

    Returns:
        The pymongo BulkWriteResult of the bulk write.
    """
    collection = get_collection("mpnoffers")
    # FIX: dropped the redundant list() wrapper around the list
    # comprehension (ruff C411).
    requests = [
        UpdateOne(
            dict(uri=update["uri"]),
            {"$set": dict(similarOffers=update["similarOffers"])},
        )
        for update in updates
    ]
    return collection.bulk_write(requests)
def add_identical_offer_relations(uris_lists: Iterable[Iterable[str]]):
    """Record each group of uris as an "identical" offer relation.

    For every group, an upsert first creates the relation document if none
    overlaps the group, then a second update merges the uris into the
    (possibly pre-existing) document and bumps its timestamp.
    """
    now = datetime.now()
    operations = []
    for uris in uris_lists:
        selector = {
            "relationType": "identical",
            "offerSet": {"$in": uris},
        }
        # 1) Create the relation document when no overlapping one exists.
        operations.append(
            UpdateOne(
                selector,
                {
                    "$setOnInsert": {
                        "createdAt": now,
                        "updatedAt": now,
                        "relationType": "identical",
                        "offerSet": uris,
                        "selectMethod": "auto",
                    },
                },
                upsert=True,
            ))
        # 2) Merge the uris into the relation and refresh updatedAt.
        operations.append(
            UpdateOne(
                selector,
                {
                    "$set": {"updatedAt": now},
                    "$addToSet": {"offerSet": {"$each": uris}},
                },
                upsert=False,
            ))
    print(f"{len(operations)} operations to add identical offers")
    collection = get_collection("offerbirelations")
    # Ordered so the upsert for a group runs before its $addToSet merge.
    return collection.bulk_write(operations, ordered=True)
def save_promoted_offers(df, collection_name: str):
    """Flag the offers referenced by a Shopgun dataframe as promoted.

    Args:
        df: DataFrame whose rows carry an ``id`` attribute used to build the
            Shopgun product uri.
        collection_name: Target Mongo collection.

    Returns:
        The pymongo BulkWriteResult of the bulk write.
    """
    collection = get_collection(collection_name)
    # FIX: dropped the redundant list() wrapper around the list
    # comprehension (ruff C411).
    requests = [
        UpdateOne(
            dict(uri=get_product_uri(provenances.SHOPGUN, row.id)),
            {
                "$set":
                dict(is_promoted=True, select_method=select_methods.AUTO)
            },
        )
        for _, row in df.iterrows()
    ]
    return collection.bulk_write(requests)
def get_handle_configs(provenance: str):
    """Fetch all non-disabled handle configs for a provenance.

    Args:
        provenance: The provenance to look up.

    Returns:
        A non-empty list of handle config documents.

    Raises:
        NoHandleConfigError: If no matching config exists.
    """
    collection = get_collection("handleconfigs")
    # FIX: list(x for x in cursor) replaced by a direct list() call, and the
    # emptiness check uses truthiness instead of len() > 0.
    result = list(collection.find({
        "provenance": provenance,
        "status": {"$ne": "disabled"},
    }))
    if result:
        return result
    raise NoHandleConfigError(
        f"No handleconfig found for provenance: {provenance}.")
def get_offers_by_uris(uris):
    """Return a cursor of offers whose uri is in ``uris``, with MONGO_PROJECTION applied."""
    mongo_filter = {"uri": {"$in": uris}}
    return get_collection("mpnoffers").find(mongo_filter, MONGO_PROJECTION)
def get_offers_with_product(
    provenance: str,
    collection_name: str,
    target_collection_name: str,
    relation_collection_name: str,
    limit: int = 0,
) -> Iterable[dict]:
    """Join offers carrying gtins with gtin-matching products and relations.

    For every offer in ``collection_name`` with a non-null ``gtins`` field,
    looks up documents in ``target_collection_name`` sharing at least one
    gtin (as ``gtin_products``) and relation documents in
    ``relation_collection_name`` whose ``offer`` field references the offer
    (as ``product_relations``).

    Args:
        provenance: Currently unused — the provenance $match stage was
            disabled; parameter kept for interface compatibility.
        collection_name: Source offers collection.
        target_collection_name: Candidate products matched by gtin.
        relation_collection_name: Offer-to-product relation collection.
        limit: If > 0, caps the number of results.

    Returns:
        The aggregation cursor (iterable of dicts).
    """
    # FIX: removed dead commented-out pipeline fragments (disabled provenance
    # match and localField/foreignField alternatives); logic is unchanged.
    collection = get_collection(collection_name)
    pipeline = [
        # Only offers that actually carry gtins can be matched.
        {"$match": {"gtins": {"$ne": None}}},
        # Expose the gtins sub-document as a [{k, v}, ...] array so it can
        # be set-intersected below.
        {"$addFields": {"gtin_list": {"$objectToArray": "$gtins"}}},
        {
            "$lookup": {
                "from": target_collection_name,
                "let": {"source_gtin_list": "$gtin_list"},
                "pipeline": [
                    {"$addFields": {"gtin_list": {"$objectToArray": "$gtins"}}},
                    {
                        "$addFields": {
                            "same_gtins": {
                                "$setIntersection": [
                                    "$$source_gtin_list",
                                    "$gtin_list",
                                ]
                            },
                        },
                    },
                    # Keep only candidates sharing at least one gtin.
                    {"$match": {"$expr": {"$gt": ["$same_gtins", []]}}},
                    {"$project": {"_id": 1, "provenance": 1, "same_gtins": 1}},
                ],
                "as": "gtin_products",
            },
        },
        {
            "$lookup": {
                "from": relation_collection_name,
                "let": {"source_id": "$_id"},
                "pipeline": [
                    {"$match": {"$expr": {"$eq": ["$$source_id", "$offer"]}}},
                    {"$project": {"_id": 1, "product": 1}},
                ],
                "as": "product_relations",
            }
        },
    ]
    if limit > 0:
        pipeline.append({"$limit": limit})
    return collection.aggregate(pipeline)
def store_handle_run(handle_run_config):
    """Persist a single handle-run record and return the insert result."""
    handleruns = get_collection("handleruns")
    return handleruns.insert_one(handle_run_config)
def get_offers_same_gtin_offers(
    provenance: str,
    collection_name: str,
    limit: int = 0,
) -> Iterable[dict]:
    """For each valid offer of a provenance, find other offers sharing a gtin.

    Returns an aggregation cursor where every document carries a non-empty
    ``gtin_products`` list of offers from "mpnoffers" that intersect on at
    least one gtin and have a different _id.
    """
    collection = get_collection(collection_name)
    now = datetime.now()
    # Only non-expired offers of this provenance that carry gtins.
    match_source = {
        "$match": {
            "validThrough": {"$gt": now},
            "provenance": provenance,
            "gtins": {"$ne": {}, "$exists": True},
        }
    }
    project_source = {"$project": {"gtins": 1, "provenance": 1, "uri": 1}}
    # Expose gtins as a key/value array and remember our own _id so the
    # lookup can exclude self-matches.
    prepare_source = {
        "$addFields": {
            "gtin_list": {"$objectToArray": "$gtins"},
            "source_id": "$_id",
        },
    }
    lookup_pipeline = [
        {
            "$match": {
                "gtins": {"$ne": {}, "$exists": True},
                "$expr": {"$ne": ["$$source_id", "$_id"]},
            },
        },
        {"$addFields": {"gtin_list": {"$objectToArray": "$gtins"}}},
        {
            "$addFields": {
                "same_gtins": {
                    "$setIntersection": [
                        "$$source_gtin_list",
                        "$gtin_list",
                    ]
                },
            },
        },
        # Keep only candidates that share at least one gtin.
        {
            "$match": {
                "same_gtins": {"$exists": True},
                "$expr": {"$gt": ["$same_gtins", []]},
            },
        },
        {"$project": {"_id": 1, "provenance": 1, "same_gtins": 1, "uri": 1}},
    ]
    lookup_stage = {
        "$lookup": {
            "from": "mpnoffers",
            "let": {
                "source_gtin_list": "$gtin_list",
                "source_id": "$source_id",
            },
            "pipeline": lookup_pipeline,
            "as": "gtin_products",
        },
    }
    # Drop source offers that matched nothing.
    drop_empty = {"$match": {"gtin_products": {"$not": {"$size": 0}}}}
    pipeline = [
        match_source,
        project_source,
        prepare_source,
        lookup_stage,
        drop_empty,
    ]
    if limit > 0:
        pipeline.append({"$limit": limit})
    return collection.aggregate(pipeline)