Example #1
def bulk_insert_collections(path,
                            indexed,
                            drop_on_start,
                            drop_on_exit=False,
                            write_concern=1):
    """

    :param path:
    :param indexed:
    :param drop_on_start:
    :param drop_on_exit:
    :param write_concern:
    :return:
    """
    if drop_on_start: drop_database_collections(DATABASE_COLLECTION)

    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    user_collection = db.get_collection(
        'users', write_concern=pymongo.WriteConcern(w=write_concern))
    tweet_collection = db.get_collection(
        'tweets', write_concern=pymongo.WriteConcern(w=write_concern))

    if indexed: create_indexes()
    else: remove_indexes()

    execution_time = 0

    document = open(path, 'r')

    users = []
    tweets = []

    for doc in document:
        d = json.loads(doc)
        users.append(d['user'])
        # add the user id to the tweet collection
        d['user_id'] = d['user']['id']
        del d['user']
        tweets.append(d)

    start_time = time.time()

    user_collection.insert_many(users)
    tweet_collection.insert_many(tweets)

    execution_time += time.time() - start_time

    size = "{}MB".format(round(os.path.getsize(DOCUMENT) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk_insert_collections {}".format(
        execution_time, size))

    if drop_on_exit: drop_database(DATABASE_COLLECTION)

    return execution_time, size
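
In the snippet above the acknowledgment behaviour of both collections is controlled by the ``w`` value handed to ``pymongo.WriteConcern``. A minimal, self-contained sketch of the same pattern, with hypothetical host, database and collection names:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client.get_database("benchmark_db")
# w=1 (the default) waits for the primary's acknowledgment; w=0 does not wait
tweets = db.get_collection("tweets",
                           write_concern=pymongo.WriteConcern(w=1))
tweets.insert_many([{"text": "hello"}, {"text": "world"}])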
Example #2
def find_collections(indexed):
    """

    :param indexed:
    :return:
    """

    if indexed:
        create_indexes()

    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)

    user_collection = db.get_collection('users',
                                        write_concern=pymongo.WriteConcern())
    tweet_collection = db.get_collection('tweets',
                                         write_concern=pymongo.WriteConcern())

    execution_time = 0

    for i in range(5):
        count = 0

        start_time = time.time()

        #count += user_collection.find({'location': 'London'}).count()
        #count += user_collection.find({'friends_count': {'$gt': 1000}}).count()
        #count += user_collection.find({'followers_count': {'$gt': 1000}}).count()

        count += user_collection.find({
            '$and': [{
                'location': 'London'
            }, {
                'friends_count': {
                    '$gt': 1000
                }
            }, {
                'followers_count': {
                    '$gt': 1000
                }
            }]
        }).count()

        execution_time += time.time() - start_time

    logger.info("{} seconds to find_collections {} with indexed={}".format(
        execution_time, count, indexed))

    return execution_time, count
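
Note that ``Cursor.count()``, used above, was deprecated in PyMongo 3.7 and removed in PyMongo 4.0. On current drivers the same compound query (the explicit ``$and`` is equivalent to an implicit one) would be counted with ``count_documents``; a sketch with hypothetical connection details:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
users = client.benchmark_db_collection.get_collection("users")
count = users.count_documents({
    "location": "London",
    "friends_count": {"$gt": 1000},
    "followers_count": {"$gt": 1000},
})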
Example #3
 def _setup_mongodb(self, logger):
     mongodb = self.config.logging.mongodb
     if mongodb:
         conn = self.config.sys.log
         if conn:
             conn = conn.connect()
             existing = conn.connection[
                 conn.database].list_collection_names()
             if conn.collection not in existing:
                 try:
                     conn.connection[conn.database].create_collection(
                         name=conn.name,
                         capped=True,
                         size=self.config.logging.size)
                     for idx in ("hostname", "identifier", "username",
                                 "qual_name"):
                         conn.create_index([("created", pymongo.DESCENDING),
                                            (idx, pymongo.DESCENDING)],
                                           name=idx)
                 except:
                     self.logger.warning("failed to create [sys.log]")
             level = getattr(logging, mongodb)
             write_concern = self.config.logging.write_concern
             handler = core4.logger.handler.MongoLoggingHandler(
                 conn.with_options(write_concern=pymongo.WriteConcern(
                     w=write_concern)))
             handler.setLevel(level)
             logger.addHandler(handler)
             self._setup_tornado(handler, level)
             self.logger.debug(
                 "mongodb logging setup complete, "
                 "level [%s], write concern [%d]", mongodb, write_concern)
         else:
             raise core4.error.Core4SetupError(
                 "config.logging.mongodb set, but config.sys.log is None")
Example #4
 def insert_fn(remaining_secs):
     remaining_millis = int(round(remaining_secs * 1000))
     write_concern = pymongo.WriteConcern(w=2,
                                          wtimeout=remaining_millis)
     coll = client.resmoke.get_collection("await_ready",
                                          write_concern=write_concern)
     coll.insert_one({"awaiting": "ready"})
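
Here ``w=2`` asks for acknowledgment from two replica set members, and ``wtimeout`` bounds how long that wait may take. A minimal sketch of handling the timeout, assuming a local replica set, reusing the database and collection names from the example, and using a hypothetical 500 ms timeout:

import pymongo
from pymongo.errors import WriteConcernError

client = pymongo.MongoClient("localhost", 27017)
coll = client.resmoke.get_collection(
    "await_ready", write_concern=pymongo.WriteConcern(w=2, wtimeout=500))
try:
    coll.insert_one({"awaiting": "ready"})
except WriteConcernError:
    # the write was not acknowledged by two members within 500 ms
    pass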
Example #5
    def await_ready(self):
        """Block until the fixture can be used for testing."""
        # Wait for the config server
        if self.configsvr is not None:
            self.configsvr.await_ready()

        # Wait for each of the shards
        for shard in self.shards:
            shard.await_ready()

        # We call self._new_mongos() and mongos.setup() here in await_ready()
        # rather than in self.setup() because mongos routers have to connect
        # to a running cluster.
        if not self.mongos:
            for i in range(self.num_mongos):
                mongos = self._new_mongos(i, self.num_mongos)
                self.mongos.append(mongos)

        for mongos in self.mongos:
            # Start up the mongos.
            mongos.setup()

            # Wait for the mongos.
            mongos.await_ready()

        client = self.mongo_client()
        self._auth_to_db(client)

        # Turn off the balancer if it is not meant to be enabled.
        if not self.enable_balancer:
            self.stop_balancer()

        # Turn off autosplit if it is not meant to be enabled.
        if not self.enable_autosplit:
            wc = pymongo.WriteConcern(w="majority", wtimeout=30000)
            coll = client.config.get_collection("settings", write_concern=wc)
            coll.update_one({"_id": "autosplit"},
                            {"$set": {"enabled": False}},
                            upsert=True)

        # Inform mongos about each of the shards
        for shard in self.shards:
            self._add_shard(client, shard)

        # Ensure that all CSRS nodes are up to date. This is strictly needed for tests that use
        # multiple mongoses. In those cases, the first mongos initializes the contents of the config
        # database, but unless those writes have replicated to all the config servers, the other
        # mongoses risk reading from a stale config server and seeing an empty config database.
        self.configsvr.await_last_op_committed()

        # Enable sharding on each of the specified databases
        for db_name in self.enable_sharding:
            self.logger.info("Enabling sharding for '%s' database...", db_name)
            client.admin.command({"enablesharding": db_name})

        # Ensure that the sessions collection gets auto-sharded by the config server
        if self.configsvr is not None:
            primary = self.configsvr.get_primary().mongo_client()
            primary.admin.command({"refreshLogicalSessionCacheNow": 1})
Example #6
def bulk_insert_one(path, drop_on_start, drop_on_exit=False, write_concern=1):
    """

    :param path:
    :param drop_on_start:
    :param drop_on_exit:
    :param write_concern:
    :return:
    """

    # drop database
    if drop_on_start: drop_database(DATABASE)

    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))

    document = open(path, 'r')

    start = time.time()
    for doc in document:
        coll.insert_one(json.loads(doc))
    run = time.time() - start

    size = "{}MB".format(round(os.path.getsize(DOCUMENT) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk insert one {}".format(run, size))

    if drop_on_exit: drop_database(DATABASE)

    return run, size
Example #7
def bulk_insert(path,
                indexed,
                drop_on_start,
                drop_on_exit=False,
                write_concern=1):
    """
    Bulk insert into MongoDB database

    :param path:
    :param indexed: insert into benchmark_db_indexed [default = False]
    :param drop_on_start:
    :param drop_on_exit:
    :param write_concern:
    :return:
    """

    # check drop flag
    if drop_on_start: drop_database(DATABASE)

    if indexed: create_indexes()
    else: remove_indexes()

    # connect to correct database:
    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))

    document = open(path, 'r')

    docs = []

    for doc in document:
        docs.append(json.loads(doc))

    start = time.time()
    coll.insert_many(docs)
    run = time.time() - start

    if indexed:
        create_indexes()
    else:
        remove_indexes()

    size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk_insert {}, indexed={}".format(
        run, size, indexed))

    # check drop flag on exit
    if drop_on_exit: drop_database(DATABASE)

    return run, size
Example #8
 def __init__(self, logger):
     # type: (Logging.Logger) -> None
     self._logger = logger
     try:
         self._mongo_connection = MongoConnection()
         self._client = self._mongo_connection.client()
         self._db = self._client.get_database(
             name=os.getenv('GREASE_MONGO_DB', 'grease'),
             write_concern=pymongo.WriteConcern(w=0))
         self._collection = self._db.get_collection(name='source_dedup')
         self._dedup = True
     except ServerSelectionTimeoutError:
         self._mongo_connection = None
         self._client = None
         self._db = None
         self._collection = None
         self._dedup = False
Example #9
    def init_app(self, app: Flask, uri=None, db_name=None, **kwargs):
        self.app = app
        if uri:
            self.uri = uri
        if db_name:
            self.db_name = db_name
        if kwargs:
            self.kwargs.update(kwargs)

        if not self.uri:
            self.uri = app.config['MONGO_URL']

        self.app = app
        self.client = pymongo.MongoClient(self.uri, **self.kwargs)
        self.database = self.client.get_database(
            self.db_name, write_concern=pymongo.WriteConcern(w='majority'))
        self.gridfs = gridfs.GridFS(self.database)  # MongoDB file storage, coll: fs.*
        self.app.extensions['mongo'] = self
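
The ``GridFS`` instance created above inherits the database's write concern, so its chunk writes wait for majority acknowledgment. A standalone sketch of the same wiring with hypothetical connection settings and database name:

import gridfs
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
database = client.get_database(
    "appdb", write_concern=pymongo.WriteConcern(w="majority"))
fs = gridfs.GridFS(database)
file_id = fs.put(b"hello gridfs", filename="greeting.txt")
print(fs.get(file_id).read())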
Example #10
    def trigger(self, name, channel=None, data=None, author=None):
        """
        Triggers an event in collection ``sys.event``.

        This method uses a special mongo connection with write concern ``0``.
        If the collection ``sys.event`` does not exist, it is created as a
        capped collection with size configured by key ``config.event.size``.

        :param name: of the event
        :param channel: of the event, defaults to channel name ``system``
        :param data: to be attached to the event
        :param author: of the event, defaults to the current username
        :return: event id (MongoDB ``_id``)
        """
        if self._event is None:
            conn = self.config.sys.event.connect(concurr=False)
            if conn:
                wc = self.config.event.write_concern
                conn = conn.with_options(
                    write_concern=pymongo.WriteConcern(w=wc))
                self.logger.debug(
                    "mongodb event setup complete, write concern [%d]", wc)
            else:
                raise core4.error.Core4SetupError("config.event not set")
            existing = conn.connection[
                conn.database].list_collection_names()
            if conn.collection not in existing:
                conn.connection[conn.database].create_collection(
                    name=conn.name,
                    capped=True,
                    size=self.config.event.size
                )
            self._event = conn
        doc = {
            "created": core4.util.node.mongo_now(),
            "name": name,
            "author": author or core4.util.node.get_username(),
            "channel": channel or core4.const.DEFAULT_CHANNEL
        }
        if data:
            doc["data"] = data
        inserted = self._event.insert_one(doc)
        return inserted.inserted_id
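
As the docstring notes, events go to a capped ``sys.event`` collection and are written with write concern ``0``, so triggering an event does not wait for acknowledgment. A reduced sketch of that setup in plain pymongo, with a hypothetical database name and collection size:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client.get_database("core4test")
if "sys.event" not in db.list_collection_names():
    # capped collections need an explicit maximum size in bytes
    db.create_collection("sys.event", capped=True, size=1024 * 1024)
events = db.get_collection("sys.event",
                           write_concern=pymongo.WriteConcern(w=0))
events.insert_one({"name": "demo", "channel": "system"})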
Example #11
    def message_databases(self):
        """List of message databases, ordered by partition number."""

        kwargs = {}
        if not self.server_version < (2, 6):
            # NOTE(flaper87): Skip mongodb versions below 2.6 when
            # setting the write concern on the database. pymongo 3.0
            # fails with norepl when creating indexes.
            doc = self.connection.write_concern.document.copy()
            doc.setdefault('w', 'majority')
            doc.setdefault('j', False)
            kwargs['write_concern'] = pymongo.WriteConcern(**doc)

        name = self.mongodb_conf.database
        partitions = self.mongodb_conf.partitions

        databases = []
        for p in range(partitions):
            db_name = name + self._COL_SUFIX + str(p)
            databases.append(self.connection.get_database(db_name, **kwargs))
        return databases
Example #12
 def deleteFile(self, file):
     """
     Delete all of the chunks in the collection that correspond to the
     given file.
     """
     q = {
         'chunkUuid': file['chunkUuid'],
         'assetstoreId': self.assetstore['_id']
     }
     matching = File().find(q, limit=2, projection=[])
     if matching.count(True) == 1:
         # If we can't reach the database, we return anyway.  A system check
         # will be necessary to remove the abandoned file.  Since we already
         # can handle that case, tell Mongo to use a 0 write concern -- we
         # don't need to know that the chunks have been deleted, and this
         # can be faster.
         try:
             self.chunkColl.with_options(write_concern=pymongo.WriteConcern(
                 w=0)).delete_many({'uuid': file['chunkUuid']})
         except pymongo.errors.AutoReconnect:
             pass
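
The ``with_options`` call above is the usual way to lower the write concern for a single operation: it returns a clone of the collection with the new options while leaving the original untouched. A small sketch of the idiom with hypothetical database, collection and chunk uuid values:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
chunk_coll = client.girder.chunk
# the clone is fire-and-forget (w=0); chunk_coll itself keeps its settings
unacked = chunk_coll.with_options(write_concern=pymongo.WriteConcern(w=0))
unacked.delete_many({"uuid": "chunk-uuid-placeholder"})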
Example #13
def mongo_save(database, collection_key, id_key, data):
    """Save results to MongoDB database.

    Args:
        database (:class:`pymongo.database.Database`): MongoDB database to save
            results to.
        collection_key (str): name of collection.
        id_key (str): id key with which to store :attr:`data`.
        data (:class:`bson.binary.Binary` or dict): data to store in
            :attr:`db`.
    """
    collection = database[collection_key].with_options(
        write_concern=pymongo.WriteConcern(w=1))
    tries_left = _MONGO_MAX_TRIES
    while tries_left > 0:
        tries_left -= 1
        try:
            collection.replace_one({'_id': id_key}, data, upsert=True)
            return
        except (pymongo.errors.WriteConcernError, pymongo.errors.WriteError):
            if tries_left == 0:
                print(f"Warning: could not write entry to mongodb after"
                      f" {_MONGO_MAX_TRIES} attempts.")
                raise
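
A possible call of ``mongo_save`` could look as follows; the database, collection and id values here are hypothetical, and the dict ends up as the replacement document keyed by ``_id``:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
database = client.get_database("results_db")
mongo_save(database,
           collection_key="experiments",
           id_key="run-0042",
           data={"loss": 0.12, "epochs": 10})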
Example #14
    def upsert_data(self, zipref, collection_name_prefix, filename, skiprows,
                    parse_dates, indice=[], na_values=["NIL", "/0"], drop_columns=["eNodeB Function Name"],
                    rename_columns={"Local cell name": "Cell Name"}):
        # skip if file read already
        myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
        mydb = myclient[self.DB_NAME]
        mycol = mydb["read_files"]
        if mycol.find_one({"_id": filename}) is not None:
            print("{} has been loaded already".format(filename))
            myclient.close()
            return
        myclient.close()

        # read df
        st = time.time()
        file_extention = os.path.splitext(filename)[1]

        if file_extention == ".csv":
            # skip Total XXX Records
            temp = io.StringIO()
            with zipref.open(filename) as f:
                for line in f.readlines():
                    line = line.decode("utf-8")
                    if not line.startswith("Total"):
                        temp.write(line)
            temp.seek(0)
            df = pd.read_csv(temp, parse_dates=parse_dates, skiprows=skiprows, na_values=na_values)
        elif file_extention == ".xlsx":
            zipref.extract(filename, "extract")
            df = pd.read_excel(os.path.join("extract", filename), parse_dates=parse_dates, na_values=na_values)
        # if Date and Time are split into 2 columns, combine them
        if "Date" in df.columns and "Time" in df.columns:
            df["Date"] = df.apply(lambda row: row["Date"].replace(" DST", ""), axis=1)
            df["Date"] = pd.to_datetime(df["Date"])
            df["Time"] = pd.to_timedelta(df["Time"])
            df["Time"] = df["Time"] - pd.to_timedelta(df["Time"].dt.days, unit='d')
            df["Time"] = df["Date"] + df["Time"]

        # rename columns
        df = df.rename(columns=rename_columns)

        # convert Cell name / Site name to string
        for col in ["Cell Name", "Site Name", "eNodeB Name", "gNodeB Nam"]:
            if col in df.columns:
                df = df.astype({col: str})
        # remove () in kpi name for 4g/5g
        if self.tech == "4G" or self.tech == "5G":
            p = re.compile(r"\(.+\)")
            columns = [p.sub("", x) for x in df.columns]
        # remove suffix after : in kpi name for 2g
        if self.tech == "2G":
            columns = [x.split(":")[0] for x in df.columns]
        df.columns = columns
        # drop columns
        df.drop(columns=[col for col in drop_columns if col in df.columns], inplace=True)

        print("load", time.time() - st)

        # # run agg
        # st = time.time()
        # if agg_function:
        #     agg = {}
        #     for field in agg_function:
        #         if field in df.columns:
        #             agg[field] = agg_function[field]
        #     df = df.groupby(parse_dates + unique_index, as_index=False).agg(agg)
        # print("agg", time.time() - st)

        # deal with auto_complete
        st = time.time()
        for auto_complete_field, auto_complete_collection in self.auto_complete_fields:
            if auto_complete_field in df.columns:
                s = set(df[auto_complete_field].unique()) - self.auto_complete_existed_sets[auto_complete_collection]
                self.auto_complete_existed_sets[auto_complete_collection].update(s)
                self.to_add_auto_complete_sets[auto_complete_collection].update(s)
        print("deal with auto_complete", time.time() - st)

        # divide df by time and insert onebyone
        st = time.time()
        df.columns = [x.replace(".", "_") for x in df.columns]
        time_col = parse_dates[0]
        for dt in df[time_col].unique():
            t_df = df[df[time_col] == dt]
            dt = dt.astype('M8[ms]').astype('O')
            if time_col == "Time":
                collection_name = collection_name_prefix + dt.strftime("%Y%m%d%H")
            if time_col == "Date":
                collection_name = collection_name_prefix + dt.strftime("%Y%m%d")
            data = t_df.to_dict(orient='records')  # one dict per row for insert_many
            print("trans df with {} rows in {}:".format(len(data), collection_name), time.time() - st)
            st = time.time()
            myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
            mydb = myclient[self.DB_NAME]
            if collection_name not in mydb.list_collection_names():
                self.create_collection(collection_name, indice)
            mycol = mydb.get_collection(collection_name, write_concern=pymongo.WriteConcern(w=0))
            mycol.insert_many(data, ordered=False)
            myclient.close()
            print("Insert {} rows in {}".format(len(data), collection_name), time.time() - st)

        # insert filename in to read_files
        myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
        mydb = myclient[self.DB_NAME]
        mycol = mydb["read_files"]
        mycol.insert_one({"_id": filename})
        myclient.close()
        print("{} inserted".format(filename))
        self.insert_to_add_auto_complete_set()
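
The inserts above combine ``ordered=False`` with write concern ``0``: the server keeps inserting the remaining rows after an individual document fails, and the client neither waits for nor is told about per-document errors. A condensed sketch of that combination with hypothetical connection, database and collection names:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
col = client.kpi_db.get_collection(
    "kpi_2020010100", write_concern=pymongo.WriteConcern(w=0))
col.insert_many([{"Cell Name": "A1"}, {"Cell Name": "A2"}], ordered=False)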
Example #15
 def process_obj(collection_name,
                 logger,
                 source_name,
                 source_max,
                 source_pointer,
                 field_set,
                 source_obj,
                 final,
                 strength=None):
     # first thing try to find the object level hash
     mongo_connection = MongoConnection()
     client = mongo_connection.client()
     db = client.get_database(name=os.getenv('GREASE_MONGO_DB', 'grease'),
                              write_concern=pymongo.WriteConcern(w=0))
     collection = db.get_collection(name=collection_name)
     hash_obj = collection.find_one(
         {'hash': SourceDeDuplify.generate_hash(source_obj)})
     if not hash_obj:
         logger.debug(
             "Failed To Locate Type1 Match, Performing Type2 Search Match",
             True)
         # Globally unique hash for request
         # create a completely new document hash and all the field set hashes
         collection.insert_one({
             'expiry': SourceDeDuplify.generate_expiry_time(),
             'max_expiry': SourceDeDuplify.generate_max_expiry_time(),
             'source': str(source_name),
             'score': 1,
             'hash': SourceDeDuplify.generate_hash(source_obj),
             'type': 1
         })
         # Next start field level processing
         # first check if our fields are limited
         if len(field_set) < 1:
             # All fields need to be considered for de-dup
             fields = source_obj.keys()
         else:
             # only registered fields
             fields = field_set
         # now lets get the composite score
         composite_score = SourceDeDuplify.get_field_score(
             collection, logger, source_name, source_obj, fields)
         if source_pointer == 0:
             compo_spot = 1
         else:
             compo_spot = source_pointer
         logger.debug("DEDUPLICATION COMPOSITE SCORE [" + str(compo_spot) +
                      "/" + str(source_max) + "]: " + str(composite_score))
         # now lets observe to see if we have a 'unique' source
         if strength is None:
             composite_score_limit = float(
                 os.getenv('GREASE_DEDUP_SCORE', 85))
         else:
             if isinstance(strength, int) or isinstance(strength, float):
                 logger.debug("Global DeDuplication strength override",
                              verbose=True)
                 composite_score_limit = float(strength)
             else:
                 composite_score_limit = float(
                     os.getenv('GREASE_DEDUP_SCORE', 85))
         if composite_score < composite_score_limit:
             # look at that its time to add it to the final list
             logger.debug("Type2 ruled Unique adding to final result", True)
             final.append(source_obj)
     else:
         # we have a duplicate source document
         # increase the counter and expiry and move on (DROP)
         logger.debug("Type1 match found, dropping", True)
         if 'max_expiry' in hash_obj:
             update_statement = {
                 "$set": {
                     'score': int(hash_obj['score']) + 1,
                     'expiry': SourceDeDuplify.generate_expiry_time()
                 }
             }
         else:
             update_statement = {
                 "$set": {
                     'score': int(hash_obj['score']) + 1,
                     'expiry': SourceDeDuplify.generate_expiry_time(),
                     'max_expiry':
                     SourceDeDuplify.generate_max_expiry_time()
                 }
             }
         collection.update_one({'_id': hash_obj['_id']}, update_statement)
     mongo_connection.client().close()
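
The duplicate branch above reads the stored ``score`` and writes ``score + 1`` back with ``$set``; the same increment can also be expressed atomically with ``$inc`` in a single update. A sketch of that alternative, with the database and collection names from the example, a hypothetical local connection, a placeholder hash and an arbitrary expiry window:

import datetime

import pymongo

client = pymongo.MongoClient("localhost", 27017)
collection = client.grease.get_collection("source_dedup")
new_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=12)
collection.update_one(
    {"hash": "abc123"},
    {"$inc": {"score": 1}, "$set": {"expiry": new_expiry}})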
Example #16
def create_fake(__document_class__,
                __db__=None,
                __parent__=None,
                __name__=None,
                __faker__=None,
                __depth__=DEFAULT_DEPTH,
                __write_concern__=pymongo.WriteConcern(w='majority'),
                **values):
    """ Create document with fake data.

    :param yadm.documents.BaseDocument __document_class__: document class
        for new instance
    :param yadm.database.Database __db__: database instance
        if specified, document and all references will be saved to database
    :param yadm.documents.BaseDocument __parent__: parent document
    :param str __name__: name of parent field
    :param Faker __faker__: faker instance, create if not specified
    :param int __depth__: maximum recursion depth; values greater than 450
        are not recommended (default 4)
    :return yadm.documents.BaseDocument: __document_class__ instance with fake data
    """
    if not issubclass(__document_class__, BaseDocument):
        raise TypeError("only BaseDocument subclasses are allowed")

    if __depth__ < 0:
        return AttributeNotSet

    if __faker__ is None:
        __faker__ = Faker()

    document = __document_class__()

    if isinstance(document, Document):
        document.__db__ = __db__
    elif isinstance(document, EmbeddedDocument):
        document.__parent__ = __parent__
        document.__name__ = __name__

    doc_fake_proc = document.__fake__(values, __faker__, __depth__ - 1)

    # extend values from __fake__ method
    if isinstance(doc_fake_proc, GeneratorType):
        values = next(doc_fake_proc)
    elif isinstance(doc_fake_proc, dict):
        values = doc_fake_proc

    # first: set values
    for name, fake in values.items():
        if fake is not AttributeNotSet:
            setattr(document, name, fake)

    # second: field faker
    for name, field in __document_class__.__fields__.items():
        if name not in values and not hasattr(document,
                                              '__fake__{}__'.format(name)):
            fake = field.get_fake(document, __faker__, __depth__ - 1)
            if fake is not AttributeNotSet:
                setattr(document, name, fake)

    # third: __fake__{name}__ methods
    for name, field in __document_class__.__fields__.items():
        if name not in values and hasattr(document,
                                          '__fake__{}__'.format(name)):
            attr = getattr(document, '__fake__{}__'.format(name))
            fake = attr(__faker__, __depth__ - 1)

            if fake is not AttributeNotSet:
                setattr(document, name, fake)

    if isinstance(doc_fake_proc, GeneratorType):
        # pre save processor
        try:
            next(doc_fake_proc)
        except StopIteration:
            doc_fake_proc = None

    if __db__ is not None:
        __db__.insert(document, write_concern=__write_concern__)

        # post save processor
        if isinstance(doc_fake_proc, GeneratorType):
            try:
                next(doc_fake_proc)
            except StopIteration:
                pass

    return document
Example #17
def insert_one_collections(path,
                           indexed,
                           drop_on_start,
                           drop_on_exit=False,
                           write_concern=1):
    """
    Inserts a single document to the benchmark_db database

    :param path:
    :param indexed:
    :param drop_on_start:
    :param drop_on_exit:
    :param write_concern:
    :return:

       Parameters:
           indexed          - insert with indexes
           doc_path         - database document path
           drop_on_start    - drop database before query
           drop_on_exit     - drop database after query

       Returns:
           insert_one_time  - execution time for one insert
           bulk_insert_time - execution time for bulk insert
           doc_size         - size of the inserted document
           db_size          - size of the database"""

    if indexed: create_indexes()
    else: remove_indexes()

    if drop_on_start: drop_database_collections(DATABASE_COLLECTION)

    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    user_collection = db.get_collection(
        'users', write_concern=pymongo.WriteConcern(w=write_concern))
    tweet_collection = db.get_collection(
        'tweets', write_concern=pymongo.WriteConcern(w=write_concern))

    d1 = open(path, 'r')
    docs = []

    users = []
    tweets = []

    for doc in d1:
        d = json.loads(doc)
        users.append(d['user'])
        # add the user id to the tweet collection
        d['user_id'] = d['user']['id']
        del d['user']
        tweets.append(d)

    start = time.time()

    user_collection.insert_many(users)
    tweet_collection.insert_many(tweets)

    bulk_insert_time = time.time() - start

    d2 = open(DOCUMENT_SINGLE, 'r')

    for doc in d2:
        d = json.loads(doc)
        users.append(d['user'])
        # add the user id to the tweet collection
        d['user_id'] = d['user']['id']
        del d['user']
        tweets.append(d)

    start = time.time()

    user_collection.insert_one(users.pop())
    tweet_collection.insert_one(tweets.pop())

    insert_one_time = time.time() - start

    doc_size = "{}MB".format(
        round(os.path.getsize(DOCUMENT_SINGLE) / 1024 / 1024, 2))
    db_size = "{}MB".format(round(os.path.getsize(DOCUMENT) / 1024 / 1024, 2))

    logger.info(
        "{} seconds to insert one collections indexed={} db_size={} doc_size={}"
        .format(insert_one_time, indexed, db_size, doc_size))

    if drop_on_exit: drop_database(DATABASE_COLLECTION)

    return insert_one_time, doc_size, db_size, bulk_insert_time
Example #18
def insert_one(path,
               indexed,
               drop_on_start,
               drop_on_exit=False,
               write_concern=1):
    """
    Inserts a single document to the benchmark_db database

    :param path:
    :param indexed:
    :param drop_on_start:
    :param drop_on_exit:
    :param write_concern:
    :return:

       Parameters:
           indexed          - insert with indexes
           doc_path         - database document path
           drop_on_start    - drop database before query
           drop_on_exit     - drop database after query

       Returns:
           insert_one_time  - execution time for one insert
           bulk_insert_time - execution time for bulk insert
           doc_size         - size of the inserted document
           db_size          - size of the database"""

    if indexed: create_indexes()
    else: remove_indexes()

    if drop_on_start: drop_database(DATABASE)

    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))

    d1 = open(path, 'r')
    docs = []

    for doc in d1:
        docs.append(json.loads(doc))

    d2 = open(DOCUMENT_SINGLE, 'r')
    single_doc = json.load(d2)

    start = time.time()
    coll.insert_many(docs)
    bulk_insert_time = time.time() - start

    start = time.time()
    coll.insert_one(single_doc)
    insert_one_time = time.time() - start

    doc_size = "{}MB".format(
        round(os.path.getsize(DOCUMENT_SINGLE) / 1024 / 1024, 2))
    db_size = "{}MB".format(round(os.path.getsize(DOCUMENT) / 1024 / 1024, 2))

    logger.info(
        "{} seconds to insert one indexed={} db_size={} doc_size={}".format(
            insert_one_time, indexed, db_size, doc_size))

    if drop_on_exit: drop_database(DATABASE)

    return insert_one_time, doc_size, db_size, bulk_insert_time