Beispiel #1
0
def insert_doc(doc: dict, col: pymongo.collection.Collection):
    """Insert ``doc`` into ``col`` unless a matching document is already stored.

    The duplicate lookup uses the ``danmaku`` value when it is truthy;
    otherwise it falls back to the ``uid`` value when that one is truthy.
    When both are falsy no lookup is made and the document is always
    inserted. A line is printed for every document that gets inserted.
    """
    if doc["danmaku"]:
        lookup = {"danmaku": doc["danmaku"]}
    elif doc["uid"]:
        lookup = {"uid": doc["uid"]}
    else:
        lookup = None

    # Pulling a single result from the cursor is enough to prove a match.
    already_stored = lookup is not None and any(True for _ in col.find(lookup))
    if already_stored:
        return

    col.insert_one(doc)
    print("insert: " + str(doc))
def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Build a schema description from the documents of a PyMongo collection.

    The documents are loaded into a list and handed to ``construct_schema``
    together with ``delimiter``; whatever that call returns is returned here.

    Parameters
    ----------
        collection:
            the PyMongo collection to read documents from
        delimiter:
            string forwarded to ``construct_schema`` for joining field names
        sample_size:
            number of documents to draw with a ``$sample`` aggregation stage;
            when falsy (``None`` or ``0``) the whole collection is read
    """
    if not sample_size:
        # No sample size given: read every document in the collection.
        cursor = collection.find({})
    else:
        # Draw a sample of the requested size from the collection.
        sample_pipeline = [{"$sample": {"size": sample_size}}]
        cursor = collection.aggregate(sample_pipeline, allowDiskUse=True)

    loaded = list(cursor)
    return construct_schema(loaded, delimiter)
Beispiel #3
0
def download_neighbors(
    collection:pymongo.collection.Collection,
    source:str,
    limit:int=0,
)->List[Tuple[str, float]]:
  """
  Return the edges leaving ``source`` as ``(target, weight)`` pairs.

  The collection is queried for documents whose ``source`` field equals
  ``source``; only ``target`` and ``weight`` are projected. ``limit`` is
  handed to ``find`` unchanged, so a positive value caps the result at the
  first ``limit`` documents the query yields (no random sampling happens
  here) and ``0`` asks for every edge.

  Edges whose ``target`` or ``weight`` is missing, or whose weight cannot be
  converted to ``float``, are reported on stdout and skipped instead of
  aborting the whole download.
  """
  cursor = collection.find(
      {"source": source},
      projection={
        "target": 1,
        "weight": 1,
        "_id": 0,
      },
      limit=limit,
  )
  neighbors = []
  for edge in cursor:
    try:
      neighbors.append((str(edge["target"]), float(edge["weight"])))
    except (KeyError, TypeError, ValueError):
      # KeyError: a field is absent; TypeError: e.g. a null weight;
      # ValueError: a weight string that is not numeric.
      print()
      print(f"Node {source} has an invalid edge: {edge}")
  return neighbors
Beispiel #4
0
def get_all_items(collection: pymongo.collection.Collection,
                  model: Type[BaseModel],
                  *,
                  page: int = 1,
                  num_per_page: int = 20,
                  query: dict = None,
                  projection: dict = None):
    """
    Retrieve one page of items from a collection

    :param collection: Collection to query
    :param model: Class which the JSON in the collection represents; it is
        instantiated once per document with the document's keys as keyword
        arguments
    :param page: Page number to retrieve (1-based: page 1 skips nothing).
        #ToDo: implement correct server-side pagination
    :param num_per_page: Number of items per page to retrieve. Defaults to 20.
    :param query: Return only objects that contain the query
    :param projection: Filter to exclude keys from each result. The mapping
        passed in is left untouched; ``ignore_mongo_id`` is merged into a copy.
    :return: List of ``model`` instances built from the selected page
    """

    # Work on a copy: updating the argument in place would leak the
    # ``ignore_mongo_id`` entries into the caller's dict.
    effective_projection = {} if projection is None else dict(projection)
    effective_projection.update(ignore_mongo_id)

    cursor = (collection.find(filter=query, projection=effective_projection)
              .skip((page - 1) * num_per_page)
              .limit(num_per_page))

    return [model(**item_json) for item_json in cursor]
Beispiel #5
0
def get_all(collection: pymongo.collection.Collection) -> List[str]:
    """Return the ``symbole`` value of every document in ``collection``.

    Values are listed in the order the cursor yields the documents.
    """
    return [entry['symbole'] for entry in collection.find({})]
Beispiel #6
0
def select_documents_after(
        refobj: ObjectId,
        collection: pymongo.collection.Collection) -> pymongo.cursor.Cursor:
    """Return a cursor over the documents whose ``_id`` is greater than ``refobj``.

    The cursor is sorted on the ``date`` field and capped at
    ``MAX_DOCUMENT_LOADED`` documents.
    """
    newer_than_ref = {"_id": {"$gt": refobj}}
    matches = collection.find(newer_than_ref)
    ordered = matches.sort('date')
    return ordered.limit(MAX_DOCUMENT_LOADED)
Beispiel #7
0
def _get_last_datetime(collection: pymongo.collection.Collection):
    """Fetch the document with the highest ``datetime`` value.

    The query sorts on ``datetime`` in descending order, keeps a single
    result, and projects only the ``datetime`` field (``_id`` is excluded).

    :param collection: Collection to query
    :return: List with at most one ``{'datetime': ...}`` document; empty when
        the query yields nothing
    """
    newest = collection.find(filter={},
                             # 1 is the inclusion flag. The previous -1 is a
                             # sort direction and only worked because non-zero
                             # numbers are read as "include".
                             projection={
                                 'datetime': 1,
                                 '_id': 0
                             },
                             sort=[('datetime', -1)],
                             limit=1)

    return list(newest)
Beispiel #8
0
 def __extract_fields_from_collection(mongo_collection: pymongo.collection.Collection,
                                      field_names: list) -> pymongo.cursor.Cursor:
     """
     Query every item of the collection with an inclusion projection
     built from the given field names
     :param mongo_collection: Collection to read from
     :param field_names: Names of the fields to request for each item
     :return: Cursor produced by the projected ``find`` call
     """
     inclusion_projection = dict.fromkeys(field_names, True)
     return mongo_collection.find({}, inclusion_projection)
Beispiel #9
0
def find_many(col: pymongo.collection.Collection,
              filter_dict: dict,
              sort_by: str = None,
              reverse: bool = False,
              limit: int = Constants.FIND_LIMIT,
              offset: int = 0,
              include_id: bool = False) -> list:
    """Return the documents of ``col`` matching ``filter_dict`` as a list.

    :param col: Collection to query
    :param filter_dict: Query filter handed to ``find``
    :param sort_by: Field to sort on; no sorting is applied when ``None``
    :param reverse: Sort descending instead of ascending (only used together
        with ``sort_by``)
    :param limit: Maximum number of documents, ``Constants.FIND_LIMIT`` by
        default
    :param offset: Number of matching documents to skip first
    :param include_id: Keep the ``_id`` field in the returned documents
    :return: List of matching documents
    """
    # {"_id": 1} would be an inclusion projection and strip every other field
    # from the result, so when the id is wanted no projection is sent at all.
    projection = None if include_id else {"_id": 0}
    result = col.find(filter_dict, projection)
    if sort_by is not None:
        direction = pymongo.DESCENDING if reverse else pymongo.ASCENDING
        result = result.sort(sort_by, direction)
    result = result.skip(offset).limit(limit)
    return list(result)
Beispiel #10
0
def get_memes_as_entries(
        memes_collection: pymongo.collection.Collection) -> list:
    """Return one printable line per meme, with the query sorted on ``name``.

    Each line has the form ``- <name> | <description> | times used: <times_used>``.
    """
    sorted_memes = memes_collection.find().sort("name")
    return [
        f"- {meme['name']} | {meme['description']} | times used: {meme['times_used']}"
        for meme in sorted_memes
    ]
Beispiel #11
0
    def fix_all_descriptions(self, collection: pymongo.collection.Collection):
        """Re-parse the description of every offer and store the result.

        Each document is passed to ``parse_single_description`` and the value
        it returns is written back with ``$set`` on the document that has the
        same ``id``. An ``IndexError`` raised for an offer is reported with a
        printed ``ERROR`` and the loop moves on to the next offer.

        Args:
            collection (pymongo.collection.Collection): MongoDB collection to be updated
        """
        progress = tqdm.tqdm(collection.find(), desc="Fixing the descriptions")
        for offer in progress:
            try:
                parsed = self.parse_single_description(offer)
                selector = {"id": offer['id']}
                collection.update_one(selector, {"$set": parsed})
            except IndexError:
                print("ERROR")
Beispiel #12
0
def mongodb_get_array(
    coll: pymongo.collection.Collection,
    meta_id: ObjectId,
    name: str,
    chunk: Optional[Tuple[int, ...]],
) -> np.ndarray:
    """Fetch the documents stored for one chunk and turn them into an array.

    Documents are selected on ``meta_id``, ``name`` and ``chunk``, restricted
    to the ``dtype``, ``shape`` and ``data`` fields, and sorted on ``n``. The
    resulting list is passed to ``docs_to_array`` together with the selector.
    """
    selector = {"meta_id": meta_id, "name": name, "chunk": chunk}
    wanted_fields = {"dtype": 1, "shape": 1, "data": 1}
    cursor = coll.find(selector, wanted_fields).sort("n")
    return docs_to_array(list(cursor), selector)
Beispiel #13
0
def build_users(database: pymongo.collection.Collection):
    """Group the documents of ``database`` into per-user objects.

    For each document the user id is obtained with ``get_id_from_document``.
    The first document seen for an id creates the user through
    ``PersonFactory.create``; every document (including that first one) is
    then fed to the user's ``update``. Reading stops once 2001 documents have
    been processed, and the running count is printed at every multiple of
    1000.

    :return: a view of the user objects that were built
    """
    users = {}

    for seen, document in enumerate(database.find(), start=1):
        user_id = get_id_from_document(document)
        if user_id not in users:
            users[user_id] = PersonFactory.create(document)
        users[user_id].update(document)
        if seen > 2000:
            break
        if seen % 1000 == 0:
            print(seen)

    return users.values()
def get_difference(collection: pymongo.collection.Collection, field: str,
                   values: pd.Series) -> pd.Series:
    """
    Return the entries of ``values`` that no document has in ``field``.

    Parameters
    ----------
    collection
        Collection to look the values up in.
    field
        Name of the document field compared against ``values``.
    values
        Candidate values to check.

    Returns
    -------
    pd.Series
        The distinct entries of ``values`` for which the query returned no
        document carrying that value in ``field``. The result is built from
        a set, so its order is not guaranteed.
    """
    # ``$in`` needs a plain array: a Series object is not something the
    # driver can encode, so the values are unpacked into a list first.
    candidates = list(values)
    cursor = collection.find({field: {"$in": candidates}})
    present = {document.get(field) for document in cursor}
    missing = set(candidates) - present

    return pd.Series(list(missing))
def update_users_client_metrics(user_collection: pymongo.collection.Collection,
                                from_date: str,
                                to_date: str,
                                dry_run: bool = True) -> None:
    """Refresh the client-side metrics of recently registered users.

    Selects the users registered strictly between ``from_date`` and
    ``to_date`` whose Amplitude ID is neither the "not found" marker nor
    ``'REDACTED'`` and that have no ``clientMetrics.isFirstSessionMobile``
    value yet, then calls ``_update_user_client_metric`` for each of them
    with the ``dry_run`` flag.
    """

    selection = {
        'registeredAt': {'$gt': from_date, '$lt': to_date},
        'clientMetrics.amplitudeId': {
            '$nin': [_AMPLITUDE_ID_NOT_FOUND, 'REDACTED'],
        },
        'clientMetrics.isFirstSessionMobile': {'$exists': False},
    }
    wanted_fields = {'_id': 1, 'clientMetrics': 1}

    updated_count = 0
    for user in user_collection.find(selection, projection=wanted_fields):
        try:
            _update_user_client_metric(user_collection, user, dry_run)
        except TooManyRequestsException:
            # The API is limited to 360 requests, so running into the limit
            # after more than 200 users is expected and only logged; hitting
            # it earlier is propagated to the caller.
            if updated_count <= 200:
                raise
            logging.info('Too many requests after updating %d users',
                         updated_count)
            return
        else:
            updated_count += 1
    def exploreChunks(self,
                      ilon_chunk: int,
                      ilat_chunk: int,
                      delta: int,
                      mask_query: Union[dict, None],
                      retrn: str,
                      col_grid: pymongo.collection.Collection)\
            -> Union[dict, pymongo.cursor.Cursor]:
        '''
        Explore an xarray chunk and returns either the number
        of grid cells or the grid ids.

        Parameters
        ----------
        ilon_chunk : int
            Longitude of the bounding box corner the box is built from; the
            box spans ``delta`` from it. The value is passed through
            ``self._shiftlon`` before it is used in the query.
        ilat_chunk : int
            Latitude of the same bounding box corner; the box covers
            ``ilat_chunk`` to ``ilat_chunk + delta``.
        delta : int
            Width and height of the bounding box (in degrees).
        mask_query : Union[dict, None]
            If all grid cells needs to be considered, set mask_query=None.
            If only certain grid cells needs to be considered, filter
            with this query; it is merged into the geo query. Example: for
            only land grid cells (i.e., excluding oceans)
            mask_query = {'lsm': {'$gt': 0.6}}.
            Land-sea mask (lsm) has fractional values in the range
            0 (sea) to 1 (land).
        retrn : str
            What to return:
            * either 'ndocs' for the number or grid cells inside the chunk
            * or 'docs' for the ids of the grid cells inside the chunk.
        col_grid : pymongo.collection.Collection
            Mongo connection to the grid collection.

        Returns
        -------
        Union[dict, pymongo.cursor.Cursor]
            For 'ndocs' a dict with the keys 'ilon_chunk' (the unshifted
            longitude), 'ilat_chunk' and 'n' (document count). For 'docs'
            the result of ``find`` projected on 'id_grid' and 'loc'.

        Raises
        ------
        ValueError
            If ``retrn`` is neither 'ndocs' nor 'docs'.
        '''
        # Reject an unknown mode up front; previously this fell through to
        # the return statement with ``res`` never assigned.
        if retrn not in ('ndocs', 'docs'):
            raise ValueError(
                f"retrn must be 'ndocs' or 'docs', got {retrn!r}")

        ilon_orig = int(ilon_chunk)
        ilon_chunk = int(self._shiftlon(x=ilon_chunk))
        # NOTE(review): the far edge is derived from the already shifted
        # longitude and shifted again -- confirm this is intended.
        ilon_plus = int(self._shiftlon(x=ilon_chunk + delta))
        ilat_chunk = int(ilat_chunk)
        # Closed ring: the first corner is repeated as the last point.
        geoqry = {
            'loc': {
                '$geoWithin': {
                    '$geometry': {
                        'type':
                        'Polygon',
                        'coordinates': [[[ilon_chunk, ilat_chunk],
                                         [ilon_plus, ilat_chunk],
                                         [ilon_plus, ilat_chunk + delta],
                                         [ilon_chunk, ilat_chunk + delta],
                                         [ilon_chunk, ilat_chunk]]]
                    }
                }
            }
        }
        if mask_query is not None:
            geoqry.update(mask_query)
        if retrn == 'ndocs':
            # How many grid cells in this chunk ?
            return {
                'ilon_chunk': ilon_orig,
                'ilat_chunk': ilat_chunk,
                'n': col_grid.count_documents(filter=geoqry)
            }
        return col_grid.find(geoqry, {'id_grid': 1, 'loc': 1, '_id': 0})
Beispiel #17
0
def find(
    query: Dict[Text, Any],
    collection: pymongo.collection.Collection,
) -> pymongo.cursor.Cursor:
    """Run ``query`` against ``collection`` and hand back what ``find`` returns."""
    matching = collection.find(query)
    return matching
Beispiel #18
0
    def convert_currencies(self, rates: RatesConverter,
                           collection: pymongo.collection.Collection):
        """Maps the wages to PLN currencies for all offers in DB - we run it on all offers to adjust for currency fluctuation

        Every offer's ``salary`` list is rebuilt with ``*_pln`` fields added
        and written back with ``$set`` on the document with the same ``id``.
        An ``IndexError`` raised for an offer is reported on stdout and the
        loop continues with the next offer.

        Args:
            rates (RatesConverter): RatesConverter instance to convert;
                indexed with each salary's ``currency`` value
            collection (pymongo.collection.Collection): MongoDB collection to be updated
        """
        def map_single_salary_field(salary: Dict, salary_field: str,
                                    currency_rate: float):
            """Helper method to convert a single field - adds a new field with _pln ending

            A disclosed value is stored back as ``int`` and its converted
            counterpart is truncated to ``int`` as well; an ``'undisclosed'``
            value is carried over unchanged to the ``_pln`` field.

            Args:
                salary (Dict): nested document with salary info (updated in place)
                salary_field (str): name of the salary field
                currency_rate (float): Currency rate from target currency to PLN

            Returns:
                Dict: Updated salary chunk
            """
            if salary[salary_field] != 'undisclosed':
                salary[salary_field] = int(salary[salary_field])
                salary[f'{salary_field}_pln'] = int(
                    int(salary[salary_field]) * currency_rate)
            else:
                salary[f'{salary_field}_pln'] = 'undisclosed'
            return salary

        def map_single_salary(salary: Dict, currency_rate: float):
            """Maps all fields in a single salary field

            Converts ``upper_range`` and ``lower_range``, then derives
            ``average`` from the two and converts it too.

            Args:
                salary (Dict): nested document with salary info
                currency_rate (float): Currency rate from target currency to PLN

            Returns:
                Dict: Updated salary chunk
            """
            salary = map_single_salary_field(salary, 'upper_range',
                                             currency_rate)
            salary = map_single_salary_field(salary, 'lower_range',
                                             currency_rate)
            # The average needs BOTH bounds. Checking only the upper one
            # would try to add an int to the 'undisclosed' string when just
            # the lower bound is missing.
            if 'undisclosed' in (salary['upper_range'],
                                 salary['lower_range']):
                salary['average'] = 'undisclosed'
            else:
                salary['average'] = (salary['upper_range'] +
                                     salary['lower_range']) / 2
            salary = map_single_salary_field(salary, 'average', currency_rate)
            return salary

        offers = collection.find()

        for offer in tqdm.tqdm(offers, desc="Mapping the currencies to PLN"):
            try:
                new_salaries = [
                    map_single_salary(salary, rates[salary['currency']])
                    for salary in offer["salary"]
                ]
                collection.update_one({"id": offer['id']},
                                      {"$set": {
                                          "salary": new_salaries
                                      }})
            except IndexError:
                print("ERROR converting currencies")
Beispiel #19
0
def get_all_memes(memes_collection: pymongo.collection.Collection) -> list:
    """Query every meme in the collection and return the result of ``find()``.

    NOTE(review): the value is returned exactly as ``find()`` produced it and
    is not converted to a ``list`` here, despite the return annotation.
    """
    return memes_collection.find()
Beispiel #20
0
def get_users(coll: pymongo.collection.Collection, **by):
    """Return the documents of type ``"user"`` that match the keyword filters.

    The keyword arguments become equality filters; ``type`` is always forced
    to ``"user"``, overriding any ``type`` keyword that was passed in.
    """
    user_query = {**by, "type": "user"}
    return list(coll.find(user_query))