def _sync_collection(self, dbname, collname):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = dbname, collname
        dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'" % (src_dbname, src_collname, dst_dbname, dst_collname))
                cursor = self._src.client()[src_dbname][src_collname].find(filter=None,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info('    skip empty collection')
                    return

                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    if self._conf.asyncio:
                        reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                        if len(reqs) == reqs_max:
                            groups.append(reqs)
                            reqs = []
                        if len(groups) == groups_max:
                            threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i]) for i in xrange(groups_max)]
                            gevent.joinall(threads)
                            groups = []
                    else:
                        reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                        if len(reqs) == reqs_max:
                            self._dst.bulk_write(dst_dbname, dst_collname, reqs)
                            reqs = []
                    n += 1
                    if n % 10000 == 0:
                        log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))

                if self._conf.asyncio:
                    if len(groups) > 0:
                        threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i]) for i in xrange(len(groups))]
                        gevent.joinall(threads)
                    if len(reqs) > 0:
                        self._dst.bulk_write(dst_dbname, dst_collname, reqs)
                else:
                    if len(reqs) > 0:
                        self._dst.bulk_write(dst_dbname, dst_collname, reqs)

                log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()
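
The class helpers self._src, self._dst, and self._conf used above are not shown; as a rough, self-contained sketch of the same chunked ReplaceOne upsert pattern (the client and collection names here are placeholders, and the gevent fan-out is omitted):

import pymongo

def copy_collection(src_client, dst_client, dbname, collname, batch_size=100):
    # Read every document from the source and upsert it into the destination
    # keyed on _id, flushing a bulk_write every batch_size operations.
    src_coll = src_client[dbname][collname]
    dst_coll = dst_client[dbname][collname]
    reqs = []
    for doc in src_coll.find(no_cursor_timeout=True):
        reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
        if len(reqs) == batch_size:
            dst_coll.bulk_write(reqs, ordered=False)
            reqs = []
    if reqs:
        dst_coll.bulk_write(reqs, ordered=False)
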
Example #2
 def parse_jsons():
     cmd = {
         "planarity": sh.Command("make_restrictions_planarity",
                                 [BUILD_DIR]),
         "tree": sh.Command("make_restrictions_tree", [BUILD_DIR]),
         "matrix": sh.Command("make_restrictions_matrix", [BUILD_DIR])
     }
     for line in cmd[method](args, MATRIX_DIR, _iter=True):
         if not line.startswith(
             ("GRAPH:", "TREE:", "MATRIX:", "RESTRICTION:")):
             if len(line) > 500:
                 print("Can't parse line:", line[:250], "...", line[250:])
             else:
                 print("Can't parse line:", line)
             continue
         type, data = line.split(":", 1)
         data = json.loads(data)
         if type in ("GRAPH", "TREE"):
             print(
                 type,
                 db.data_sources.replace_one({
                     "id": data["id"]
                 },
                                             data,
                                             upsert=True).raw_result)
         elif type == "MATRIX":
             print(
                 type,
                 db.matrices.replace_one({
                     "id": data["id"]
                 },
                                         data,
                                         upsert=True).raw_result)
         else:
             yield pymongo.ReplaceOne({"id": data["id"]}, data, upsert=True)
Example #3
 def parse_jsons():
     if ttype in ("Zanetti", "JGraphEd", "GraphTea"):
         cmd = test_commands[ttype]("-n", ttype, all_restrictions,
                                    infiles)
         for l in cmd:
             j = json.loads(l)
             yield pymongo.ReplaceOne({"id": j["id"]}, j, upsert=True)
     else:
         for file in infiles:
             cmd = test_commands[ttype]("-n", ttype, all_restrictions,
                                        file)
             for l in cmd:
                 j = json.loads(l)
                 yield pymongo.ReplaceOne({"id": j["id"]},
                                          j,
                                          upsert=True)
Example #4
 def generateSmsWhitelistingData(tmpValue, mobile = '918660430751'):
     if constant.config['cluster'] in ['nightly', 'staging']:
         for i in range(0, 3):
             try:
                 testCol = constant.config['mongoConn']
                 value = {
                     "mobile": mobile, 
                     "delivered": 0, 
                     "not_delivered": 0
                 }
                 value.update(tmpValue)
                 value['total'] = value['delivered'] + value['not_delivered']
                 batchReq = []
                 batchReq.append(pymongo.ReplaceOne({'mobile': mobile}, value, upsert=True))
                 testCol.bulk_write(batchReq)
                 Logger.log(testCol.find({'mobile' : mobile})[0])
                 return
             except pymongo.errors.ConnectionFailure as e:
                 Logger.log(e)
                 port = constant.config['INTOUCH_DB_MONGO_MASTER']
                 if Utils.restartTunnel(port):
                     DarknightHelper.getMongoConnection('whitelisting', 'mobile_status')
                 else:
                     break
             except Exception as e:
                 break
         raise Exception(e)
Example #5
    def on_update(
        self,
        prefix: List[str],
        build_identifiers: BuildIdentifierSet,
        page_id: FileId,
        page: Page,
    ) -> None:
        super().on_update(prefix, build_identifiers, page_id, page)

        uploadable_assets = [
            asset for asset in page.static_assets if asset.can_upload()
        ]

        fully_qualified_pageid = "/".join(prefix + [page_id.without_known_suffix])

        # Construct filter for retrieving build documents
        document_filter: Dict[str, Union[str, Dict[str, Any]]] = {
            "page_id": fully_qualified_pageid,
            **construct_build_identifiers_filter(build_identifiers),
        }

        document = {
            "page_id": fully_qualified_pageid,
            **{
                key: value
                for (key, value) in build_identifiers.items()
                if value is not None
            },
            "prefix": prefix,
            "filename": page_id.as_posix(),
            "ast": page.ast.serialize(),
            "source": page.source,
            "created_at": datetime.utcnow(),
            "static_assets": [
                {"checksum": asset.get_checksum(), "key": asset.key}
                for asset in uploadable_assets
            ],
        }

        if page.query_fields:
            document.update({"query_fields": page.query_fields})

        self.pending_writes[COLL_DOCUMENTS].append(
            pymongo.ReplaceOne(document_filter, document, upsert=True)
        )

        for static_asset in uploadable_assets:
            self.pending_writes[COLL_ASSETS].append(
                pymongo.UpdateOne(
                    {"_id": static_asset.get_checksum()},
                    {
                        "$setOnInsert": {
                            "_id": static_asset.get_checksum(),
                            "data": static_asset.data,
                        }
                    },
                    upsert=True,
                )
            )
Example #6
    def _sync_collection_with_query(self, namespace_tuple, query, prog_q, res_q):
        """ Sync collection with query.
        """
        self._src.reconnect()
        self._dst.reconnect()

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(filter=query,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           # snapshot cause blocking, maybe bug
                                                                           # modifiers={'$snapshot': True}
                                                                           )
                total = 0
                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self.bulk_write, self._dst.client(), dst_dbname, dst_collname, groups[i]) for i in range(groups_max)]
                        gevent.joinall(threads)
                        groups = []

                    n += 1
                    total += 1
                    if n % 10000 == 0:
                        prog_q.put(n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self.bulk_write, self._dst.client(), dst_dbname, dst_collname, groups[i]) for i in range(len(groups))]
                    gevent.joinall(threads)
                if len(reqs) > 0:
                    self.bulk_write(self._dst.client(), dst_dbname, dst_collname, reqs)

                if n > 0:
                    prog_q.put(n)
                res_q.put(total)

                prog_q.close()
                prog_q.join_thread()
                res_q.close()
                res_q.join_thread()
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()
Example #7
def upsert_articles(articles: List[Article]):
    """
    Upserts a list of article documents
    """
    articles_upserts = [
        pymongo.ReplaceOne({'article_url': article.article_url},
                           article.__dict__,
                           upsert=True) for article in articles
    ]
    _DB_ARTICLES.bulk_write(articles_upserts)
Example #8
def bulk_replace_upsert(collection, key, docs):
    ops = []
    for doc in docs:
        ops.append(pymongo.ReplaceOne({key: doc[key]}, doc, upsert=True))
        if (len(ops) == 1000):
            # replace 1000 entries
            collection.bulk_write(ops, ordered=False)
            ops = []

    # replace remaining
    if (len(ops) > 0):
        collection.bulk_write(ops, ordered=False)
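
A possible usage of bulk_replace_upsert, keying on a natural identifier instead of _id (the connection string, database, and sample documents below are placeholders and assume a reachable MongoDB instance):

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
products = client['shop']['products']
docs = [{'sku': 'A-1', 'price': 10}, {'sku': 'A-2', 'price': 12}]
# Each document is matched on its 'sku' field and replaced, or inserted
# if no document with that sku exists yet.
bulk_replace_upsert(products, 'sku', docs)
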
Example #9
    def insert_mine_compound(self, compound_dict=None, requests=None):
        """This method saves a RDKit Molecule as a compound entry in the MINE.
        Calculates necessary fields for API and includes additional
        information passed in the compound dict. Overwrites preexisting
        compounds in MINE on _id collision.

        :param compound_dict: Information of compound to insert to database
        :type compound_dict: dict
        :param requests: A list of requests for pymongo bulk write
        : type requests: list
        """

        if compound_dict is None:
            return None

        # Store all different representations of the molecule (SMILES, Formula,
        #  InChI key, etc.) as well as its properties in a dictionary
        if '_atom_count' in compound_dict:
            del compound_dict['_atom_count']

        if 'Inchikey' in compound_dict:
            del compound_dict['Inchikey']

        if 'ID' in compound_dict:
            del compound_dict['ID']
        # If the compound is a reactant, then make sure the reactant name is
        # in a correct format.
        if 'Reactant_in' in compound_dict and isinstance(
                compound_dict['Reactant_in'], str) \
                and compound_dict['Reactant_in']:
            compound_dict['Reactant_in'] = ast.literal_eval(
                compound_dict['Reactant_in'])
        # If the compound is a product, then make sure the reactant name is
        # in a correct format.
        if 'Product_of' in compound_dict \
                and isinstance(compound_dict['Product_of'], str) \
                and compound_dict['Product_of']:
            compound_dict['Product_of'] = ast.literal_eval(
                compound_dict['Product_of'])

        # If bulk insertion, upsert (insert and update) the database
        if requests != None:
            requests.append(
                pymongo.ReplaceOne({'_id': compound_dict['_id']},
                                   compound_dict,
                                   upsert=True))
        else:
            self.compounds.replace_one({'_id': compound_dict['_id']},
                                       compound_dict,
                                       upsert=True)

        return None
Example #10
    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)
        src_ns = '%s.%s' % (src_dbname, src_collname)

        total = self._src.client()[src_dbname][src_collname].count()
        self._progress_logger.register(src_ns, total)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(filter=None,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           modifiers={'$snapshot': True})

                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10
                n = 0

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self.bulk_write, self._dst.client(), dst_dbname, dst_collname, groups[i]) for i in range(groups_max)]
                        gevent.joinall(threads)
                        groups = []

                    n += 1
                    if n % 10000 == 0:
                        self._progress_logger.add(src_ns, n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self.bulk_write, self._dst.client(), dst_dbname, dst_collname, groups[i]) for i in range(len(groups))]
                    gevent.joinall(threads)
                if len(reqs) > 0:
                    self.bulk_write(self._dst.client(), dst_dbname, dst_collname, reqs)

                self._progress_logger.add(src_ns, n, done=True)
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()
Example #11
def test_collection_bulk_write(elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction('transaction.test')
    requests = [pymongo.InsertOne({'x': 1}),
                pymongo.DeleteOne({'x': 1}),
                pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction('transaction.test')
    transactions = elasticapm_client.instrumentation_store.get_all()
    span = _get_pymongo_trace(transactions[0]['spans'])
    assert span['type'] == 'db.mongodb.query'
    assert span['name'] == 'elasticapm_test.blogposts.bulk_write'
Example #12
def update_sramongo_sra_records(docs, collection):
    db_operations = []
    for doc in docs:
        db_operations.append(pymongo.ReplaceOne({"srx": doc.srx}, doc.to_mongo(), upsert=True))

        # Write intermediate results
        if len(db_operations) > 500:
            res = collection.bulk_write(db_operations)
            logger.debug(res.bulk_api_result)
            db_operations = []

    if db_operations:
        res = collection.bulk_write(db_operations)
        logger.debug(res.bulk_api_result)
Example #13
def update_station_information(stations, collection, batch_size=100):
    '''
    Bulk replace Citi Bike stations in MongoDB. New stations are inserted automatically; changed stations are replaced.
    '''

    batched_operations = []
    for station in stations:
        batched_operations.append(pymongo.ReplaceOne(
            { '_id': station['_id'] },
            station,
            upsert=True))
        write_batch(batch=batched_operations, collection=collection, batch_size=batch_size, full_batch_required=True)

    # Don't forget the last batch that might not fill up the whole batch_size ;)
    write_batch(batch=batched_operations, collection=collection, batch_size=batch_size, full_batch_required=False)
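
The write_batch helper used above is not part of the snippet; a minimal sketch consistent with how it is called (hypothetical, not taken from the original source) could be:

def write_batch(batch, collection, batch_size, full_batch_required=True):
    # With full_batch_required=True, only flush once the batch is full;
    # the final call passes False to flush whatever is left over.
    if full_batch_required and len(batch) < batch_size:
        return
    if batch:
        collection.bulk_write(batch, ordered=False)
        batch.clear()  # empty the caller's list in place
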
Example #14
 def test_collection_bulk_write(self):
     self.client.begin_transaction('transaction.test')
     requests = [
         pymongo.InsertOne({'x': 1}),
         pymongo.DeleteOne({'x': 1}),
         pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)
     ]
     result = self.db.blogposts.bulk_write(requests)
     self.assertEqual(result.inserted_count, 1)
     self.assertEqual(result.deleted_count, 1)
     self.assertEqual(result.upserted_count, 1)
     self.client.end_transaction('transaction.test')
     transactions = self.client.instrumentation_store.get_all()
     trace = _get_pymongo_trace(transactions[0]['traces'])
     self.assertEqual(trace['type'], 'db.mongodb.query')
     self.assertEqual(trace['name'], 'elasticapm_test.blogposts.bulk_write')
Example #15
def test_collection_bulk_write(instrument, elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction("transaction.test")
    requests = [
        pymongo.InsertOne({"x": 1}),
        pymongo.DeleteOne({"x": 1}),
        pymongo.ReplaceOne({"w": 1}, {"z": 1}, upsert=True),
    ]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction("transaction.test")
    transactions = elasticapm_client.transaction_store.get_all()
    span = _get_pymongo_trace(transactions[0]["spans"])
    assert span["type"] == "db.mongodb.query"
    assert span["name"] == "elasticapm_test.blogposts.bulk_write"
Example #16
def update_operation_entities(mongo_source, collection_name, id_creation_func,
                              filter_criteria, correction_map, batch_size):
    collection = mongo_source.mongo_handle[
        mongo_source.db_name][collection_name]
    total_updated = 0
    try:
        for batch_of_variant in find_documents_in_batch(
                mongo_source, collection_name, filter_criteria, batch_size):
            update_statements = []
            for variant in batch_of_variant:
                filter_dict = {'_id': variant['_id']}
                for key in correction_map:
                    if callable(correction_map[key]):
                        variant[key] = correction_map[key](variant[key])
                    elif 'inactiveObjects' in key:
                        inactive_objects = variant['inactiveObjects']
                        prop = key.split('.')[-1]
                        for inactive in inactive_objects:
                            inactive[prop] = correction_map[key]
                    else:
                        variant[key] = correction_map[key]
                for inactive in variant['inactiveObjects']:
                    inactive['hashedMessage'] = id_creation_func(inactive)
                variant.pop('_id')
                update_statements.append(
                    pymongo.ReplaceOne(filter_dict, variant))
            if update_statements:
                result_update = collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
                    .bulk_write(requests=update_statements, ordered=False)
                total_updated += result_update.modified_count
                logger.debug(
                    f'{result_update.modified_count} documents updated in {collection_name}'
                )
            else:
                logger.warning(
                    f'{len(update_statements)} update statements created. Skipping.'
                )
        logger.info(f'{total_updated} documents updated in {collection_name}')
    except Exception as e:
        print(traceback.format_exc())
        raise e
    return total_updated
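
The find_documents_in_batch helper is defined elsewhere in that codebase; a minimal sketch consistent with how it is used here (hypothetical, mirroring the mongo_source access pattern above) could page through matching documents like this:

def find_documents_in_batch(mongo_source, collection_name, filter_criteria, batch_size):
    # Yield lists of at most batch_size documents matching the filter.
    collection = mongo_source.mongo_handle[mongo_source.db_name][collection_name]
    batch = []
    for doc in collection.find(filter_criteria, no_cursor_timeout=True):
        batch.append(doc)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch
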
Example #17
 def save_all(self, items, extend=False):
     assert isinstance(items, list)
     ops = []
     for item in items:
         assert isinstance(item, Identifier)
         assert item.ean is not None or item.asin is not None
         fil = Identifier.filter_intersection(ean=item.ean, asin=item.asin)
         if extend:
             a = self.col.find_one(fil)
             if a is not None:
                 item = Identifier.extend(Identifier.from_obj(a), item)
         obj = item.to_obj()
         print('Save obj={o}.'.format(o=obj))
         ops.append(pymongo.ReplaceOne(fil, obj, upsert=True))
     if len(ops) == 0:
         return None
     else:
         res = self.col.bulk_write(ops)
         #print(result.bulk_api_result)
         return res
Example #18
def load(db_host, db_port, db_name, filename):
    """Load a full database dump from JSON"""

    backup_log("Connecting database")

    client = pymongo.MongoClient(db_host, db_port)
    db = client[db_name]

    backup_log("Loading data")

    with open(filename, "r") as file:
        data = json.load(file)

    backup_log("Storing data to database")

    for import_item in data:
        collection_name = import_item["collection"]

        collection = db[collection_name]
        requests = []

        for document in import_item["data"]:
            document["_id"] = bson.ObjectId(document["_id"])
            requests.append(
                pymongo.ReplaceOne({"uuid": document["uuid"]},
                                   document,
                                   upsert=True))

        size = len(requests)

        if size > 0:
            collection.bulk_write(requests)
        backup_log("Imported %i object%s into collection '%s'" %
                   (size, "s" if size != 1 else "", collection_name))

    backup_log("Done")

    return True
Example #19
    def insert_reaction(self, reaction_dict, requests=None):
        """Inserts a reaction into the MINE database and returns _id of the
         reaction in the mine database.

        :param reaction_dict: A dictionary containing 'Reactants' and
         'Products' lists of StoichTuples
        :type reaction_dict: dict        
        :return: The hashed _id of the reaction
        :rtype: str
        """

        reaction_dict = utils.convert_sets_to_lists(reaction_dict)
        # If bulk insertion, upsert (insert and update) the database
        if requests != None:
            requests.append(
                pymongo.ReplaceOne({'_id': {
                    '$eq': reaction_dict['_id']
                }},
                                   reaction_dict,
                                   upsert=True))
        else:
            save_document(self.reactions, reaction_dict)
        return reaction_dict['_id']
Example #20
    def write(self, symbol, item, metadata=None, chunker=DateChunker(), audit=None, **kwargs):
        """
        Writes data from item to symbol in the database

        Parameters
        ----------
        symbol: str
            the symbol that will be used to reference the written data
        item: Dataframe or Series
            the data to write the database
        metadata: ?
            optional per symbol metadata
        chunker: Object of type Chunker
            A chunker that chunks the data in item
        audit: dict
            audit information
        kwargs:
            optional keyword args that are passed to the chunker. Includes:
            chunk_size:
                used by chunker to break data into discrete chunks.
                see specific chunkers for more information about this param.
            func: function
                function to apply to each chunk before writing. Function
                can not modify the date column.
        """
        if not isinstance(item, (DataFrame, Series)):
            raise Exception("Can only chunk DataFrames and Series")

        self._arctic_lib.check_quota()

        previous_shas = []
        doc = {}
        meta = {}

        doc[SYMBOL] = symbol
        doc[LEN] = len(item)
        doc[SERIALIZER] = self.serializer.TYPE
        doc[CHUNKER] = chunker.TYPE
        doc[USERMETA] = metadata

        sym = self._get_symbol_info(symbol)
        if sym:
            previous_shas = set([Binary(x[SHA]) for x in self._collection.find({SYMBOL: symbol},
                                                                               projection={SHA: True, '_id': False},
                                                                               )])
        ops = []
        meta_ops = []
        chunk_count = 0

        for start, end, chunk_size, record in chunker.to_chunks(item, **kwargs):
            chunk_count += 1
            data = self.serializer.serialize(record)
            doc[CHUNK_SIZE] = chunk_size
            doc[METADATA] = {'columns': data[METADATA][COLUMNS] if COLUMNS in data[METADATA] else ''}
            meta = data[METADATA]

            for i in xrange(int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)):
                chunk = {DATA: Binary(data[DATA][i * MAX_CHUNK_SIZE: (i + 1) * MAX_CHUNK_SIZE])}
                chunk[SEGMENT] = i
                chunk[START] = meta[START] = start
                chunk[END] = meta[END] = end
                chunk[SYMBOL] = meta[SYMBOL] = symbol
                dates = [chunker.chunk_to_str(start), chunker.chunk_to_str(end), str(chunk[SEGMENT]).encode('ascii')]
                chunk[SHA] = self._checksum(dates, chunk[DATA])

                meta_ops.append(pymongo.ReplaceOne({SYMBOL: symbol,
                                                    START: start,
                                                    END: end},
                                                   meta, upsert=True))

                if chunk[SHA] not in previous_shas:
                    ops.append(pymongo.UpdateOne({SYMBOL: symbol,
                                                  START: start,
                                                  END: end,
                                                  SEGMENT: chunk[SEGMENT]},
                                                 {'$set': chunk}, upsert=True))
                else:
                    # already exists, don't need to update in mongo
                    previous_shas.remove(chunk[SHA])

        if ops:
            self._collection.bulk_write(ops, ordered=False)
        if meta_ops:
            self._mdata.bulk_write(meta_ops, ordered=False)

        doc[CHUNK_COUNT] = chunk_count
        doc[APPEND_COUNT] = 0

        if previous_shas:
            mongo_retry(self._collection.delete_many)({SYMBOL: symbol, SHA: {'$in': list(previous_shas)}})

        mongo_retry(self._symbols.update_one)({SYMBOL: symbol},
                                              {'$set': doc},
                                              upsert=True)
        if audit is not None:
            audit['symbol'] = symbol
            audit['action'] = 'write'
            audit['chunks'] = chunk_count
            self._audit.insert_one(audit)
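
A hedged usage sketch for the write method above, consistent with its docstring (the library handle, symbol name, and chunk_size value are placeholders for whatever the surrounding chunkstore code sets up):

import pandas as pd

df = pd.DataFrame({'data': range(10)},
                  index=pd.date_range('2020-01-01', periods=10, name='date'))
# Assuming `chunkstore` is an initialized store exposing this write method:
# chunkstore.write('MY_SYMBOL', df, chunk_size='D')
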
Example #21
    def _sync_collection(self, src_dbname, src_collname, dst_dbname,
                         dst_collname):
        """ Sync a collection through batch write.
        """
        while True:
            try:
                self._logger.info("sync collection '%s.%s'" %
                                  (src_dbname, src_collname))
                cursor = self._src_mc[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})

                if self._src_engine == 'tokumx':
                    # TokuMX 'count' command may be very slow, use 'collStats' command instead
                    count = self._src_mc[src_dbname].command(
                        {'collStats': src_collname})['count']
                else:
                    count = cursor.count()

                if count == 0:
                    self._logger.info('\t skip empty collection')
                    return

                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    if self._asyncio:
                        reqs.append(
                            pymongo.ReplaceOne({'_id': doc['_id']},
                                               doc,
                                               upsert=True))
                        if len(reqs) == reqs_max:
                            groups.append(reqs)
                            reqs = []
                        if len(groups) == groups_max:
                            threads = [
                                gevent.spawn(self._bulk_write, dst_dbname,
                                             dst_collname, groups[i])
                                for i in xrange(groups_max)
                            ]
                            gevent.joinall(threads)
                            groups = []
                    else:
                        reqs.append(
                            pymongo.ReplaceOne({'_id': doc['_id']},
                                               doc,
                                               upsert=True))
                        if len(reqs) == reqs_max:
                            self._bulk_write(dst_dbname, dst_collname, reqs)
                            reqs = []
                    n += 1
                    if n % 10000 == 0:
                        self._logger.info('\t %s.%s %d/%d (%.2f%%)' %
                                          (src_dbname, src_collname, n, count,
                                           float(n) / count * 100))

                if self._asyncio:
                    if len(groups) > 0:
                        threads = [
                            gevent.spawn(self._bulk_write, dst_dbname,
                                         dst_collname, groups[i])
                            for i in xrange(len(groups))
                        ]
                        gevent.joinall(threads)
                    if len(reqs) > 0:
                        self._bulk_write(dst_dbname, dst_collname, reqs)
                else:
                    if len(reqs) > 0:
                        self._bulk_write(dst_dbname, dst_collname, reqs)

                self._logger.info('\t %s.%s %d/%d (%.2f%%)' %
                                  (src_dbname, src_collname, n, count,
                                   float(n) / count * 100))
                return
            except pymongo.errors.AutoReconnect:
                self._src_mc.close()
                self._src_mc = self.reconnect(self._src_host,
                                              self._src_port,
                                              username=self._src_username,
                                              password=self._src_password)
Example #22
def export_pymongo(docs,
                   index,
                   mirrors=None,
                   update=False,
                   num_tries=3,
                   timeout=60,
                   chunksize=100):
    """Optimized :py:func:`~.export` function for pymongo index collections.

    The behavior of this function is roughly equivalent to:

    .. code-block:: python

        for doc in docs:
            export_one(doc, index, mirrors, num_tries)

    .. note::

        All index documents must be JSON-serializable to
        be able to be exported to a MongoDB collection.

    :param docs: The index documents to export.
    :param index: The database collection to export the index to.
    :type index: :class:`pymongo.collection.Collection`
    :param num_tries: The number of automatic retry attempts in case of
        mirror connection errors.
    :type num_tries: int
    :param timeout: The time in seconds to wait before an
        automatic retry attempt.
    :type timeout: int
    :param chunksize: The buffer size for export operations.
    :type chunksize: int"""
    import pymongo
    logger.info(
        "Exporting to pymongo database collection index '{}'.".format(index))
    chunk = []
    operations = []
    ids = defaultdict(list)
    for doc in docs:
        f = {'_id': doc['_id']}
        if update:
            root = doc.get('root')
            if root is not None:
                ids[root].append(doc['_id'])
        chunk.append(doc)
        operations.append(pymongo.ReplaceOne(f, doc, upsert=True))
        if len(chunk) >= chunksize:
            logger.debug("Pushing chunk.")
            _export_pymongo(chunk, operations, index, mirrors, num_tries,
                            timeout)
            chunk[:] = []
            operations[:] = []
    if len(operations):
        logger.debug("Pushing final chunk.")
        _export_pymongo(chunk, operations, index, mirrors, num_tries, timeout)
    if update:
        if ids:
            stale = set()
            for root in ids:
                docs_ = index.find({'root': root})
                all_ = {doc['_id'] for doc in docs_}
                stale.update(all_.difference(ids[root]))
            logger.info("Removing {} stale documents.".format(len(stale)))
            for _id in set(stale):
                index.delete_one(dict(_id=_id))
        else:
            raise errors.ExportError(
                "The exported docs sequence is empty! Unable to update!")
Example #23
            dels = delete[i * 100:min((i + 1) * 100, len(delete))]
            helpers.bulk(es, dels, chunk_size=100)
            print(min((i + 1) * 100, len(delete)))

if (is_okay):
    client = MongoClient(
        'mongodb://*****:*****@ds119030-a0.mlab.com:19030,ds119030-a1.mlab.com:19030/glarket?replicaSet=rs-ds119030'
    )
    db = client.glarket
    products = db.products
    batches = int(math.ceil(len(data) / 50.0))
    for i in range(0, batches):
        upload = data[i * 50:min((i + 1) * 50, len(data))]
        reqs = [
            pymongo.ReplaceOne({'product_link': x['product_link']},
                               x,
                               upsert=True) for x in upload
        ]
        try:
            result = products.bulk_write(reqs, ordered=False)
            print(result.bulk_api_result)
            print("Batch: " + str(i) + "/" + str(batches) + "; " +
                  str(i / (batches * 1.0) * 100) + "%")
        except BulkWriteError as bwe:
            print(bwe.details)

## Update ES
#### If prod_id exists, update variables
#### Else, upload
es = Elasticsearch(
    'https://*****:*****@fbc3032a2a91be69517a70b3d75f4eaa.us-east-1.aws.found.io:9243'
Example #24
 def parse_jsons():
     for l in cmd:
         j = json.loads(l)
         j["id"] = "/".join((j["file"], j["type"], str(j["index"])))
         yield pymongo.ReplaceOne({"id": j["id"]}, j, upsert=True)