def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Build a schema description from the documents of a PyMongo collection.

    The documents are fetched from ``collection``, materialised into a list
    and passed to ``construct_schema`` together with ``delimiter``; the value
    returned by ``construct_schema`` is returned unchanged.

    The returned schema is keyed by tuples of nested field names, with each
    value containing 'types', 'count', 'nullable', 'delimited_name', and
    'type' attributes.

    Parameters
    ----------
        collection:
            the PyMongo collection to read documents from
        delimiter:
            string to concatenate field names by
        sample_size:
            number of documents to draw through a ``$sample`` aggregation
            stage; when omitted (or otherwise falsy, e.g. ``0``) every
            document in the collection is read with ``find({})``
    """

    if not sample_size:
        # No sample size given: read the whole collection.
        cursor = collection.find({})
    else:
        # Let the server pick `sample_size` random documents.
        sample_stage = {"$sample": {"size": sample_size}}
        cursor = collection.aggregate([sample_stage], allowDiskUse=True)

    fetched_documents = list(cursor)
    return construct_schema(fetched_documents, delimiter)
Beispiel #2
0
def get_user_contracts(coll: pymongo.collection.Collection,
                       user_id: ObjectId,
                       sort_by: str = "valid_from",
                       contract_type: str = "dpp"):
    """
    Return one user's embedded contracts of a given type, sorted descending.

    Runs an aggregation on ``coll`` that selects the document whose ``_id``
    equals ``user_id``, unwinds its ``contracts`` array, keeps only the
    contracts whose ``type`` equals ``contract_type``, sorts them by
    ``contracts.<sort_by>`` in descending order and pushes them back into a
    single ``contracts`` list.

    Parameters
    ----------
        coll:
            collection holding the user documents
        user_id:
            ``_id`` of the user document; must be an ``ObjectId``
        sort_by:
            name of the contract field to sort by (descending)
        contract_type:
            value of ``contracts.type`` to keep; defaults to ``"dpp"``, the
            only type that was supported before this parameter existed

    Returns
    -------
        The list of matching contracts, or an empty list when the
        aggregation yields no document (no such user, or no contract of the
        requested type).

    Raises
    ------
        AssertionError
            if ``user_id`` is not an ``ObjectId`` (the check is an ``assert``
            and is therefore skipped when Python runs with ``-O``).
    """
    assert isinstance(user_id, ObjectId)

    pipeline = [
        # Select the single user document.
        {"$match": {"_id": user_id}},
        # One output document per embedded contract.
        {"$unwind": "$contracts"},
        # Keep only contracts of the requested type.
        {"$match": {"contracts.type": contract_type}},
        # Newest / largest `sort_by` value first.
        {"$sort": {f"contracts.{sort_by}": -1}},
        # Collapse back to one document; the result relies on $push
        # collecting contracts in the order they arrive from $sort.
        {"$group": {"_id": "$_id", "contracts": {"$push": "$contracts"}}},
    ]
    cursor = coll.aggregate(pipeline)

    # The pipeline yields at most one document (grouped by the user's _id);
    # fall back to an empty dict so a missing user gives an empty list.
    return next(cursor, {}).get("contracts", [])
Beispiel #3
0
def find_flights_with_low_prices(threshold: int, search_date: str,
                                 collection: pymongo.collection.Collection,
                                 logger: logging.Logger) -> None:
    """
    Find itineraries whose summed price is below ``threshold`` and log the count.

    For every document whose ``Query.OutboundDate`` equals ``search_date``
    the ``Itineraries`` array is unwound and, per itinerary, the ``Price`` of
    all its ``PricingOptions`` is summed server-side and keyed by the
    itinerary's ``OutboundLegId``. Leg ids whose sum is strictly lower than
    ``threshold`` are collected and their number is written to ``logger``.

    Fetching and returning the full flight data (with a link to order
    tickets) is not implemented yet; the function currently returns ``None``.

    Parameters
    ----------
        threshold:
            upper (exclusive) bound for the summed itinerary price
        search_date:
            value matched against ``Query.OutboundDate``
        collection:
            collection the aggregation is run on
        logger:
            logger receiving the summary message
    """
    stage_name = "GET_MIN_PRICE"

    # calculate price for each itinerary (can have several Legs)
    price_per_flight_pipeline = [{
        "$match": {
            "Query.OutboundDate": search_date
        }
    }, {
        "$unwind": "$Itineraries"
    }, {
        "$project": {
            "Itineraries.OutboundLegId": {
                # Build a one-entry object {<OutboundLegId>: <summed price>}.
                "$arrayToObject": [[{
                    "k": "$Itineraries.OutboundLegId",
                    "v": {
                        "$reduce": {
                            "input": "$Itineraries.PricingOptions",
                            "initialValue": 0,
                            "in": {
                                "$add": ["$$value", "$$this.Price"]
                            }
                        }
                    }
                }]]
            }
        }
    }]
    price_per_flight_results = collection.aggregate(price_per_flight_pipeline)

    # find flights with prices < threshold
    flights_with_low_prices = []
    for price_per_flight in price_per_flight_results:
        leg_prices = price_per_flight['Itineraries']['OutboundLegId']
        for leg_id, total_price in leg_prices.items():
            # The server-side sum is null when PricingOptions (or a Price)
            # is null/missing; such itineraries have no usable price and
            # comparing None with an int would raise TypeError.
            if total_price is None:
                continue
            if total_price < threshold:
                flights_with_low_prices.append(leg_id)

    logger.info(
        f"{stage_name} - Found {len(flights_with_low_prices)} flights with prices lower than {threshold}"
    )

    # TODO - request flights data from MongoDB for the legs collected above
    # and return it to the caller.
Beispiel #4
0
def aggregate(
    collection: pymongo.collection.Collection,
    aggregation: Iterable[Dict[Text, Any]],
) -> pymongo.command_cursor.CommandCursor:
    """
    Run an aggregation pipeline on ``collection`` with ``allowDiskUse=True``.

    ``aggregation`` may be any iterable of pipeline stages; it is converted
    to a list before being passed to ``collection.aggregate``. The resulting
    cursor is returned as-is.
    """
    pipeline_stages = list(aggregation)
    return collection.aggregate(pipeline_stages, allowDiskUse=True)
Beispiel #5
0
def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    use_random_sampling: bool,
    max_document_size: int,
    is_version_gte_4_4: bool,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Calls construct_schema on a PyMongo collection.

    Returned schema is keyed by tuples of nested field names, with each
    value containing 'types', 'count', 'nullable', 'delimited_name', and 'type' attributes.

    Parameters
    ----------
        collection:
            the PyMongo collection
        delimiter:
            string to concatenate field names by
        use_random_sampling:
            when true, ``sample_size`` documents are picked with a ``$sample``
            stage; otherwise the first ``sample_size`` documents are taken
            with a ``$limit`` stage
        max_document_size:
            maximum size of the document that will be considered for generating the schema.
            Only enforced when ``is_version_gte_4_4`` is true.
        is_version_gte_4_4:
            whether the size filter (which uses ``$bsonSize``) should be
            added to the pipeline; when false, documents are not filtered by
            size at all
        sample_size:
            number of items in the collection to sample
            (reads entire collection if not provided)
    """

    doc_size_field = "temporary_doc_size_field"
    aggregations: List[Dict] = []
    if is_version_gte_4_4:
        # create a temporary field to store the size of the document. filter on it and then remove it.
        aggregations = [
            {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
            {"$match": {doc_size_field: {"$lt": max_document_size}}},
            {"$project": {doc_size_field: 0}},
        ]

    # Only restrict the number of documents when a sample size was given.
    # Appending {"$sample": {"size": None}} or {"$limit": None} would make
    # the server reject the pipeline instead of reading the whole collection
    # as documented.
    if sample_size:
        if use_random_sampling:
            # get sample documents in collection
            aggregations.append({"$sample": {"size": sample_size}})
        else:
            # take the first `sample_size` documents
            aggregations.append({"$limit": sample_size})

    documents = collection.aggregate(aggregations, allowDiskUse=True)

    return construct_schema(list(documents), delimiter)
Beispiel #6
0
def getEarliestUpdateTime(c: pymongo.collection.Collection) -> datetime.datetime:
    """
    Return the smallest ``updated`` value found in collection ``c``.

    All documents are grouped under one constant key and the minimum of
    their ``updated`` field is computed server-side. The resulting group
    document is printed to stdout before its ``firstInsert`` value is
    returned.
    """
    # A constant `_id` puts every document into the same group.
    min_updated_stage = {
        '$group': {
            '_id': 'all',
            'firstInsert': {'$min': '$updated'},
        },
    }
    cursor = c.aggregate([min_updated_stage])
    summary = cursor.next()
    print(summary)
    return summary['firstInsert']