Example #1
def replace_with_correct_contig(mongo_source, assembly_accession, study_accession, incorrect_contig, correct_contig,
                                num_variants_to_replace):
    sve_collection = mongo_source.mongo_handle[mongo_source.db_name]["submittedVariantEntity"]
    filter_criteria = {'seq': assembly_accession, 'study': study_accession, 'contig': incorrect_contig}
    cursor = sve_collection.with_options(read_concern=ReadConcern("majority")) \
        .find(filter_criteria, no_cursor_timeout=True).limit(num_variants_to_replace)
    insert_statements = []
    drop_statements = []
    total_inserted, total_dropped = 0, 0
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, "Original id is different from the one calculated %s != %s" % (
                variant['_id'], original_id)
            variant['contig'] = correct_contig
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
        result_insert = sve_collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
            .bulk_write(requests=insert_statements, ordered=False)
        total_inserted += result_insert.inserted_count
        result_drop = sve_collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
            .bulk_write(requests=drop_statements, ordered=False)
        total_dropped += result_drop.deleted_count
        logger.info('%s / %s new documents inserted' % (total_inserted, num_variants_to_replace))
        logger.info('%s / %s old documents dropped' % (total_dropped, num_variants_to_replace))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()
    return total_inserted
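A hypothetical invocation of the helper above (the assembly, study and contig names below are placeholders, not values from any real study):

total = replace_with_correct_contig(mongo_source, 'GCA_000000000.1', 'PRJEB00000',
                                    'wrong_contig', 'CM000000.1', 1000)
logger.info('Replaced %s variants', total)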
Example #2
def DupsRemover():
    # Group documents by tweet text and collect the ids of each duplicate group
    pipeline = [{
        '$group': {
            '_id': '$Tweet',
            'count': {
                '$sum': 1
            },
            'ids': {
                '$push': '$_id'
            }
        }
    }, {
        '$match': {
            'count': {
                '$gte': 2
            }
        }
    }]
    deleted = 0
    for name in db2.list_collection_names():
        col = db2[name]
        requests = []
        for document in col.aggregate(pipeline):
            it = iter(document['ids'])
            next(it)  # keep the first occurrence
            for id in it:
                requests.append(pymongo.DeleteOne({'_id': id}))
        # Issue one bulk write per collection instead of one per delete
        if requests:
            result = col.bulk_write(requests, ordered=False)
            deleted += result.deleted_count
            print('deleted: %s' % deleted)
Example #3
def deprecate(settings_xml_file, study, assembly_accession, contigs=None):
    """
    Connect to MongoDB and retrieve all variants that need to be deprecated.
    Copy the variants into the operation collection and delete them from the submitted variant collection.
    """
    with pymongo.MongoClient(get_mongo_uri_for_eva_profile('production', settings_xml_file)) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantEntity"]
        deprecated_sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantOperationEntity"]
        cursor = sve_collection.find({'seq': assembly_accession, 'study': study, 'contig': {'$in': contigs}})
        insert_statements = []
        drop_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(inactive_object(variant)))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        # There should only be 458 variants to deprecate
        assert len(insert_statements) == 458
        assert len(drop_statements) == 458

        logger.info('Found %s variants to deprecate', len(insert_statements))

        # The bulk writes must run inside the "with" block, while the client is still open
        result_insert = deprecated_sve_collection.bulk_write(requests=insert_statements, ordered=False)
        result_drop = sve_collection.bulk_write(requests=drop_statements, ordered=False)
        logger.info('There were %s new documents inserted in inactive entities' % result_insert.inserted_count)
        logger.info('There were %s old documents dropped from the submitted variant collection' % result_drop.deleted_count)
Example #4
def correct(mongo_user, mongo_password, mongo_host, study, reference_source,
            reference_dest):
    """
    Connect to mongodb and retrieve all variants the should be updated, Check their key and update them in bulk.
    """
    with get_mongo_connection_handle(
            username=mongo_user, password=mongo_password,
            host=mongo_host) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle["eva_accession_sharded"][
            "submittedVariantEntity"]
        cursor = sve_collection.find({'study': study, 'seq': reference_source})
        insert_statements = []
        drop_statements = []
        record_checked = 0
        for variant in cursor:
            # Ensure that the variant we are changing has the expected SHA1
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, \
                "Original id is different from the one calculated %s != %s" % (variant['_id'], original_id)
            variant['seq'] = reference_dest
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            record_checked += 1

        print('Retrieved %s documents and checked matching SHA1 hash' % record_checked)
        result_insert = sve_collection.bulk_write(requests=insert_statements, ordered=False)
        print('There were %s new documents inserted' % result_insert.inserted_count)
        result_drop = sve_collection.bulk_write(requests=drop_statements, ordered=False)
        print('There were %s old documents dropped' % result_drop.deleted_count)
Example #5
def get_insert_statements(sve_collection, contig_equivalents):
    wrong_contigs = list(contig_equivalents.keys())
    filter_criteria = {
        'seq': 'GCA_000001895.4',
        'study': 'PRJEB42012',
        'contig': {
            '$in': wrong_contigs
        }
    }
    cursor = sve_collection.with_options(
        read_concern=ReadConcern("majority")).find(filter_criteria,
                                                   no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, f"Original id is different from the one calculated " \
                                                  f"{variant['_id']} != {original_id}"
            variant['contig'] = contig_equivalents[variant['contig']]
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()

    return insert_statements, drop_statements
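get_insert_statements only builds the operation lists; a minimal sketch of how a caller might execute them, reusing the majority write concern seen in the other examples (the surrounding setup is assumed):

insert_statements, drop_statements = get_insert_statements(sve_collection, contig_equivalents)
if insert_statements:
    write_collection = sve_collection.with_options(
        write_concern=WriteConcern(w="majority", wtimeout=1200000))
    # Insert the corrected documents first, then drop the originals
    result_insert = write_collection.bulk_write(requests=insert_statements, ordered=False)
    result_drop = write_collection.bulk_write(requests=drop_statements, ordered=False)
    print('%s inserted, %s dropped' % (result_insert.inserted_count, result_drop.deleted_count))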
Example #6
def deprecate(settings_xml_file, database_name, contigs=None):
    """
    Connect to MongoDB and retrieve all variants that need to be deprecated.
    Copy the variants into the to_delete collection and delete them from the variants collection.
    """
    with pymongo.MongoClient(
            get_mongo_uri_for_eva_profile('production',
                                          settings_xml_file)) as mongo_handle:
        variant_collection = mongo_handle[database_name]['variants_2_0']
        deleted_variant_collection = mongo_handle[database_name][
            'to_delete_variants_2_0']

        cursor = variant_collection.find({'chr': {'$in': contigs}})
        drop_statements = []
        insert_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        # The bulk writes must run inside the "with" block, while the client is still open
        logger.info('Found %s variants to remove', len(drop_statements))
        result_insert = deleted_variant_collection.bulk_write(
            requests=insert_statements, ordered=False)
        result_drop = variant_collection.bulk_write(requests=drop_statements,
                                                    ordered=False)
        logger.info('There were %s new documents inserted in to_delete collection' %
                    result_insert.inserted_count)
        logger.info('There were %s documents dropped from the variants collection' %
                    result_drop.deleted_count)
Example #7
    def test_successful_multiple_queries(self):
        with tracer.start_active_span("test"):
            self.conn.test.records.bulk_write([pymongo.InsertOne({"type": "string"}),
                                               pymongo.UpdateOne({"type": "string"}, {"$set": {"type": "int"}}),
                                               pymongo.DeleteOne({"type": "string"})])

        assert_is_none(tracer.active_span)

        spans = self.recorder.queued_spans()
        self.assertEqual(len(spans), 4)

        test_span = spans.pop()

        seen_span_ids = set()
        commands = []
        for span in spans:
            self.assertEqual(test_span.t, span.t)
            self.assertEqual(span.p, test_span.s)

            # check if all spans got a unique id
            assert_false(span.s in seen_span_ids)

            seen_span_ids.add(span.s)
            commands.append(span.data["mongo"]["command"])

        # ensure spans are ordered the same way as commands
        assert_list_equal(commands, ["insert", "update", "delete"])
Example #8
def bulkwrite(coll,
              objs,
              append=False):  # append is used during bson/json export
    if gen.dumpDir is not None:
        if not os.path.isdir(gen.dumpDir):
            os.makedirs(gen.dumpDir, exist_ok=True)
        if not os.path.isdir(gen.dumpDir):
            raise FileNotFoundError(gen.dumpDir)
        ext: str = "bson" if gen.bsonMode else "json"
        outpath: str = os.path.join(gen.dumpDir, f"{coll.name}.{ext}")
        openmode = ("a" if append else "w") + ("b" if gen.bsonMode else "")
        with open(outpath, openmode) as f:
            for o in objs:
                if gen.bsonMode:
                    f.write(bson.encode(o.__dict__))
                else:
                    f.write(json.dumps(o.__dict__, default=str, indent=4))
        print(f"Colleciton {coll.name}: dumped to {outpath}")
    else:
        ledger = []
        for x in objs:
            ledger.append(pymongo.DeleteOne({"_id": x._id}))
            ledger.append(pymongo.InsertOne(x.__dict__))
        res = coll.bulk_write(ledger)
        print(f"Collection {coll.name}: {res.bulk_api_result}")
Example #9
def find_duplicates_and_remove_them(mongo_user, mongo_password, mongo_host, mongo_database, assembly_accession,
                                    contig_list, study_list, dry_run):
    duplicates_to_remove_commands = []
    with get_mongo_connection_handle_url(
            username=mongo_user,
            password=mongo_password,
            host=mongo_host
    ) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle[mongo_database]["submittedVariantEntity"]
        cursor = sve_collection.find({
            "seq": assembly_accession, "contig": {"$in": contig_list}, "study": {"$in": study_list}
        }, no_cursor_timeout=True)
        try:
            for record in cursor:
                variants_with_accessions = list(sve_collection.find({'accession': record['accession']}))
                if len(variants_with_accessions) != 1:
                    print('Found %s duplicates for accession %s' % (len(variants_with_accessions), record['accession']))
                for variant in variants_with_accessions:
                    if same_variant_except_contig(variant, record):
                        duplicates_to_remove_commands.append(
                            pymongo.DeleteOne({'contig': variant['contig'], 'accession': variant['accession']})
                        )
        finally:
            cursor.close()

        if dry_run:
            print("Would remove %s variants" % len(duplicates_to_remove_commands))
        elif duplicates_to_remove_commands:
            sve_collection.bulk_write(requests=duplicates_to_remove_commands, ordered=False)
            print("Removed %s variants" % len(duplicates_to_remove_commands))
Example #10
def do_updates(cve_collection, synonym_dictionaries, assembly_accession,
               chunk_size, number_of_variants_to_replace):
    cursor = cve_collection.find({'asm': assembly_accession},
                                 no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    record_checked = 0
    already_genbanks = 0
    total_inserted = 0
    total_dropped = 0
    logging.info("Performing updates...")
    try:
        for variant in cursor:
            # Ensure that the variant we are changing has the expected SHA1
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, \
                "Original id is different from the one calculated %s != %s" % (variant['_id'], original_id)
            genbank, was_already_genbank = get_genbank(synonym_dictionaries,
                                                       variant['contig'])
            if was_already_genbank:
                already_genbanks += 1
            else:
                variant['contig'] = genbank
                variant['_id'] = get_SHA1(variant)
                insert_statements.append(pymongo.InsertOne(variant))
                drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            record_checked += 1
            if len(insert_statements) >= chunk_size:
                total_inserted, total_dropped = execute_bulk(
                    drop_statements, insert_statements, cve_collection,
                    total_dropped, total_inserted)
                logging.info('%s / %s new documents inserted' %
                             (total_inserted, number_of_variants_to_replace))
                logging.info('%s / %s old documents dropped' %
                             (total_dropped, number_of_variants_to_replace))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()

    if len(insert_statements) > 0:
        total_inserted, total_dropped = execute_bulk(drop_statements,
                                                     insert_statements,
                                                     cve_collection,
                                                     total_dropped,
                                                     total_inserted)
    logging.info('Retrieved %s documents and checked matching SHA1 hash' %
                 record_checked)
    logging.info(
        '{} of those documents already had a GenBank contig. If the projects were all affected, '
        'that number should be 0, but even if it is not, there is nothing else to fix'
        .format(already_genbanks))
    logging.info('There were %s new documents inserted' % total_inserted)
    logging.info('There were %s old documents dropped' % total_dropped)
    return total_inserted
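execute_bulk is not shown in this example; a minimal sketch of what such a helper presumably does, inferred from the call sites above (clearing the statement lists in place is an assumption, but without it the chunking condition would keep firing on the same statements):

def execute_bulk(drop_statements, insert_statements, collection, total_dropped, total_inserted):
    # Insert the corrected documents first, then drop the originals
    result_insert = collection.bulk_write(requests=insert_statements, ordered=False)
    total_inserted += result_insert.inserted_count
    result_drop = collection.bulk_write(requests=drop_statements, ordered=False)
    total_dropped += result_drop.deleted_count
    # Empty both lists in place so the caller can keep accumulating
    insert_statements.clear()
    drop_statements.clear()
    return total_inserted, total_dropped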
Example #11
    def remove_leak(self, source_id, batch_size=10000):

        source = self.get_source(source_id)
        accounts_deleted = 0
        to_delete = []

        errprint('\n[*] Deleting leak "{}{}"'.format(
            source.name,
            ':{}'.format(source.hashtype) if source.hashtype else ''))

        try:

            # delete accounts
            for result in self.account_tags.find({'s': [source_id]},
                                                 {'_id': 1}):
                to_delete.append(pymongo.DeleteOne(result))
                if len(to_delete) % batch_size == 0:
                    accounts_deleted += self.accounts.bulk_write(
                        to_delete, ordered=False).deleted_count
                    to_delete.clear()
                    errprint(
                        '\r[+] Deleted {:,} accounts'.format(accounts_deleted),
                        end='')

            if to_delete:
                accounts_deleted += self.accounts.bulk_write(
                    to_delete, ordered=False).deleted_count

            # delete out of tags collection
            self.account_tags.delete_many({'s': [source_id]})
            # pull source ID from affected accounts
            self.account_tags.update_many({'s': source_id},
                                          {'$pull': {
                                              's': source_id
                                          }})

            errprint('\r[+] Deleted {:,} accounts'.format(accounts_deleted),
                     end='')

            self.sources.delete_many({'_id': source_id})
            self.counters.update_one({'collection': 'sources'},
                                     {'$unset': {
                                         str(source_id): ''
                                     }})

        except TypeError as e:
            errprint(str(e))
            errprint('[!] Can\'t find source "{}:{}"'.format(
                source.name, source.hashtype))

        errprint('\n[*] {:,} accounts deleted'.format(accounts_deleted))
        errprint('[*] Done')
        # sleep(1)
        return accounts_deleted
Example #12
def test_collection_bulk_write(elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction('transaction.test')
    requests = [pymongo.InsertOne({'x': 1}),
                pymongo.DeleteOne({'x': 1}),
                pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction('transaction.test')
    transactions = elasticapm_client.instrumentation_store.get_all()
    span = _get_pymongo_trace(transactions[0]['spans'])
    assert span['type'] == 'db.mongodb.query'
    assert span['name'] == 'elasticapm_test.blogposts.bulk_write'
Example #13
    def test_collection_bulk_write(self):
        self.client.begin_transaction('transaction.test')
        requests = [
            pymongo.InsertOne({'x': 1}),
            pymongo.DeleteOne({'x': 1}),
            pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)
        ]
        result = self.db.blogposts.bulk_write(requests)
        self.assertEqual(result.inserted_count, 1)
        self.assertEqual(result.deleted_count, 1)
        self.assertEqual(result.upserted_count, 1)
        self.client.end_transaction('transaction.test')
        transactions = self.client.instrumentation_store.get_all()
        trace = _get_pymongo_trace(transactions[0]['traces'])
        self.assertEqual(trace['type'], 'db.mongodb.query')
        self.assertEqual(trace['name'], 'elasticapm_test.blogposts.bulk_write')
Example #14
def test_collection_bulk_write(instrument, elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction("transaction.test")
    requests = [
        pymongo.InsertOne({"x": 1}),
        pymongo.DeleteOne({"x": 1}),
        pymongo.ReplaceOne({"w": 1}, {"z": 1}, upsert=True),
    ]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction("transaction.test")
    transactions = elasticapm_client.transaction_store.get_all()
    span = _get_pymongo_trace(transactions[0]["spans"])
    assert span["type"] == "db.mongodb.query"
    assert span["name"] == "elasticapm_test.blogposts.bulk_write"
Example #15
def move_past_conferences(db, from_table, to_table):
    cur_date = datetime.now().date()
    happened = [
        entry for entry in db[from_table].find()
        if (datetime.strptime(entry['Start Date'], "%d/%m/%y").date() -
            cur_date).days < 0
    ]
    if len(happened) > 0:
        db[to_table].bulk_write(
            [pymongo.InsertOne(entry) for entry in happened])
        in_to_table = list(
            filter(None, [db[to_table].find_one(entry) for entry in happened]))
        db[from_table].bulk_write(
            [pymongo.DeleteOne(entry) for entry in in_to_table])

        if in_to_table != happened:
            print(
                "For some reason some documents have not been moved to the past conferences table!"
            )
Example #16
def replace_with_correct_contig(mongo_source):
    correct_contig = 'AF010406.1'
    sve_collection = mongo_source.mongo_handle[
        mongo_source.db_name]["submittedVariantEntity"]
    filter_criteria = {
        'seq': 'GCA_000298735.1',
        'study': 'PRJEB33693',
        'contig': '-'
    }
    cursor = sve_collection.find(filter_criteria, no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    number_of_variants_to_replace = 78
    total_inserted, total_dropped = 0, 0
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, \
                "Original id is different from the one calculated %s != %s" % (variant['_id'], original_id)
            variant['contig'] = correct_contig
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
        result_insert = sve_collection.bulk_write(requests=insert_statements,
                                                  ordered=False)
        total_inserted += result_insert.inserted_count
        result_drop = sve_collection.bulk_write(requests=drop_statements,
                                                ordered=False)
        total_dropped += result_drop.deleted_count
        logging.info('%s / %s new documents inserted' %
                     (total_inserted, number_of_variants_to_replace))
        logging.info('%s / %s old documents dropped' %
                     (total_dropped, number_of_variants_to_replace))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()
    return total_inserted
Example #17
def correct(private_config_xml_file,
            profile='production',
            mongo_database='eva_accession_sharded'):
    with pymongo.MongoClient(
            get_mongo_uri_for_eva_profile(
                profile, private_config_xml_file)) as mongo_handle:
        sve_collection = mongo_handle[mongo_database]["submittedVariantEntity"]
        filter_criteria = {'seq': 'GCA_002742125.1', 'study': 'PRJEB42582'}
        cursor = sve_collection.find(filter_criteria)
        insert_statements = []
        drop_statements = []
        number_of_variants_to_replace = 10
        total_inserted, total_dropped = 0, 0
        try:
            for variant in cursor:
                original_id = get_SHA1(variant)
                assert variant['_id'] == original_id, \
                    "Original id is different from the one calculated %s != %s" % (variant['_id'], original_id)
                variant['contig'] = 'CM008482.1'
                variant['_id'] = get_SHA1(variant)
                insert_statements.append(pymongo.InsertOne(variant))
                drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            result_insert = sve_collection.bulk_write(
                requests=insert_statements, ordered=False)
            total_inserted += result_insert.inserted_count
            result_drop = sve_collection.bulk_write(requests=drop_statements,
                                                    ordered=False)
            total_dropped += result_drop.deleted_count
            logging.info('%s / %s new documents inserted' %
                         (total_inserted, number_of_variants_to_replace))
            logging.info('%s / %s old documents dropped' %
                         (total_dropped, number_of_variants_to_replace))
        except Exception as e:
            print(traceback.format_exc())
            raise e
        finally:
            cursor.close()
        return total_inserted
Example #18
def get_insert_statements(sve_collection, curr_contig, swap_contig):
    filter_criteria = {'seq': 'GCA_000003025.6', 'study': 'PRJEB28579', 'contig': curr_contig}
    cursor = sve_collection.with_options(read_concern=ReadConcern("majority")) \
        .find(filter_criteria, no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, "Original id is different from the one calculated %s != %s" % (
                variant['_id'], original_id)
            variant['contig'] = swap_contig
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()

    return insert_statements, drop_statements
Example #19
def delete_variants_wrong_contig(mongo_source):
    sve_collection = mongo_source.mongo_handle[
        mongo_source.db_name]["submittedVariantEntity"]
    filter_criteria = {
        'seq': 'GCA_000298735.1',
        'study': 'PRJEB33693',
        'contig': 'OARMT'
    }
    cursor = sve_collection.find(filter_criteria, no_cursor_timeout=True)
    drop_statements = []
    number_of_variants_to_drop, total_dropped = 78, 0
    try:
        for variant in cursor:
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))
        result_drop = sve_collection.bulk_write(requests=drop_statements,
                                                ordered=False)
        total_dropped += result_drop.deleted_count
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()
    return total_dropped
Example #20
    batches = int(math.ceil(len(rm_oos) / 100.0))
    for i in range(0, batches):
        oos = rm_oos[i * 100:min((i + 1) * 100, len(rm_oos))]
        z = helpers.bulk(es, oos, chunk_size=100)
        print(min((i + 1) * 100, len(rm_oos)))

    ## Delete in Mongo
    client = MongoClient(
        'mongodb://*****:*****@ds119030-a0.mlab.com:19030,ds119030-a1.mlab.com:19030/glarket?replicaSet=rs-ds119030'
    )
    db = client.glarket
    products = db.products
    batches = int(math.ceil(len(is_not_recent) / 50.0))
    for i in range(0, batches):
        delete = is_not_recent[i * 50:min((i + 1) * 50, len(is_not_recent))]
        reqs = [
            pymongo.DeleteOne({'product_link': x['_source']['product_link']})
            for x in delete
        ]
        try:
            result = products.bulk_write(reqs, ordered=False)
            print(result.bulk_api_result)
            print("Batch: " + str(i) + "/" + str(batches) + "; " +
                  str(i / (batches * 1.0) * 100) + "%")
        except BulkWriteError as bwe:
            print(bwe.details)
Example #21
def replace_document_with_correct_information(mongo_source, collection_name,
                                              id_creation_func,
                                              filter_criteria, correction_map,
                                              batch_size):
    collection = mongo_source.mongo_handle[
        mongo_source.db_name][collection_name]

    total_inserted, total_dropped = 0, 0
    try:
        for batch_of_variant in find_documents_in_batch(
                mongo_source, collection_name, filter_criteria, batch_size):
            insert_statements = []
            drop_statements = []
            for variant in batch_of_variant:
                original_id = id_creation_func(variant)
                assert variant['_id'] == original_id, \
                    "Original id is different from the one calculated %s != %s" % (variant['_id'], original_id)
                for key in correction_map:
                    variant[key] = correction_map[key]
                variant['_id'] = id_creation_func(variant)
                insert_statements.append(pymongo.InsertOne(variant))
                drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            if insert_statements and drop_statements:
                try:
                    result_insert = collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000))\
                                              .bulk_write(requests=insert_statements, ordered=False)
                    total_inserted += result_insert.inserted_count
                    logger.debug(
                        f'{result_insert.inserted_count} new documents inserted in {collection_name}'
                    )
                except BulkWriteError as bulk_error:
                    error_code_names = {
                        error.get('codeName')
                        for error in bulk_error.details.get('writeErrors')
                    }
                    if error_code_names == {'DuplicateKey'}:
                        # This error occurs because we were able to create the entry in a previous run but not able to
                        # remove the original variant yet
                        total_inserted += bulk_error.details.get('nInserted')
                        logger.debug(
                            f"Duplicate key error found while inserting, but still inserted "
                            f"{bulk_error.details.get('nInserted')} new documents "
                            f"in {collection_name}")
                    else:
                        raise bulk_error

                result_drop = collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
                                        .bulk_write(requests=drop_statements, ordered=False)
                total_dropped += result_drop.deleted_count
                logger.debug(
                    f'{result_drop.deleted_count} old documents dropped in {collection_name}'
                )
            else:
                logger.warning(
                    f'{len(insert_statements)} insert statements and {len(drop_statements)} drop statements '
                    f'created. Skipping.')

        logger.info(
            f'{total_inserted} new documents inserted in {collection_name}')
        logger.info(
            f'{total_dropped} old documents dropped in {collection_name}')
    except Exception as e:
        print(traceback.format_exc())
        raise e
    return total_inserted
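find_documents_in_batch is not shown here; a minimal sketch of a generator with the behaviour the code above relies on (the exact signature and the majority read concern are assumptions):

def find_documents_in_batch(mongo_source, collection_name, filter_criteria, batch_size):
    collection = mongo_source.mongo_handle[mongo_source.db_name][collection_name]
    cursor = collection.with_options(read_concern=ReadConcern("majority")) \
        .find(filter_criteria, no_cursor_timeout=True)
    batch = []
    try:
        for document in cursor:
            batch.append(document)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch
    finally:
        cursor.close()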
Example #22

# Load the database access password
config = configparser.ConfigParser()
config.read('config.ini')
password = config['MONGODB']['password']


# Create the connection
client = pymongo.MongoClient("mongodb+srv://clodo:" + password + "@orgarq-bcluw.mongodb.net/test?retryWrites=true")

# Access the database (or create it)
db = client.tweets


# Group tweets by tweet_id and collect the ids of duplicates
pipeline = [{'$group': {'_id': '$tweet_id', 'count': {'$sum': 1}, 'ids': {'$push': '$_id'}}},
    {'$match': {'count': {'$gte': 2}}}]


requests = []
for document in db.tweets_resumidos.aggregate(pipeline):
    it = iter(document['ids'])
    next(it)  # keep the first occurrence
    for id in it:
        requests.append(pymongo.DeleteOne({'_id': id}))

# bulk_write raises InvalidOperation when given an empty request list
if requests:
    db.tweets_resumidos.bulk_write(requests)


Example #23
def merge_users(
    context, mongodb, username, password, authdb, host, port, loglevel, config, live
):
    """scout: manage interactions with a scout instance."""
    coloredlogs.install(level=loglevel)

    LOG.info("Running scout version %s", __version__)
    LOG.debug("Debug logging enabled.")

    mongo_config = {}
    cli_config = {}
    if config:
        LOG.debug("Use config file %s", config)
        with open(config, "r") as in_handle:
            cli_config = yaml.load(in_handle, Loader=yaml.FullLoader)

    mongo_config["mongodb"] = mongodb or cli_config.get("mongodb") or "scout"

    mongo_config["host"] = host or cli_config.get("host") or "localhost"
    mongo_config["port"] = port or cli_config.get("port") or 27017
    mongo_config["username"] = username or cli_config.get("username")
    mongo_config["password"] = password or cli_config.get("password")
    mongo_config["authdb"] = (
        authdb or cli_config.get("authdb") or mongo_config["mongodb"]
    )
    mongo_config["omim_api_key"] = cli_config.get("omim_api_key")

    # always dryrun unless live is set
    dryrun = not live

    LOG.info("Setting database name to %s", mongo_config["mongodb"])
    LOG.debug("Setting host to %s", mongo_config["host"])
    LOG.debug("Setting port to %s", mongo_config["port"])

    valid_connection = check_connection(
        host=mongo_config["host"],
        port=mongo_config["port"],
        username=mongo_config["username"],
        password=mongo_config["password"],
        authdb=mongo_config["authdb"],
    )

    LOG.info("Test if mongod is running")
    if not valid_connection:
        LOG.warning("Connection could not be established")
        context.abort()

    try:
        client = get_connection(**mongo_config)
    except ConnectionFailure:
        context.abort()

    database = client[mongo_config["mongodb"]]

    LOG.info("Setting up a mongo adapter")
    mongo_config["client"] = client
    adapter = MongoAdapter(database)

    ## First, create all operation requests that would be needed in a live run.
    user_requests = []
    event_requests = []
    acmg_requests = []
    clinvar_requests = []
    clinvar_submission_requests = []
    case_requests = []

    for oi_user_obj in adapter.user_collection.find({"_id": {"$type": "objectId"}}):
        if not oi_user_obj.get("email"):
            continue

        LOG.info("===USER===")
        oi_user_id = oi_user_obj.get("_id")
        oi_user_email = oi_user_obj.get("email")
        LOG.info("user: {}".format(oi_user_obj))

        create_alt = False
        alt_user_obj = adapter.user_collection.find_one({"_id": oi_user_email})
        if not alt_user_obj:
            create_alt = True
            alt_user_obj = copy.deepcopy(oi_user_obj)
            alt_user_obj["_id"] = oi_user_email
        else:
            LOG.info("alt user: {}".format(alt_user_obj))
            merged_institutes = set()
            merged_institutes.update(
                alt_user_obj.get("institutes", []) + oi_user_obj.get("institutes", [])
            )
            LOG.info("merged institutes: {}".format(merged_institutes))
            alt_user_obj["institutes"] = list(merged_institutes)

            merged_roles = set()
            merged_roles.update(
                alt_user_obj.get("roles", []) + oi_user_obj.get("roles", [])
            )
            LOG.info("merged roles: {}".format(merged_roles))
            alt_user_obj["roles"] = list(merged_roles)

            created_at = oi_user_obj.get("created_at")
            alt_created_at = alt_user_obj.get("created_at")
            if (alt_created_at and created_at) and alt_created_at < created_at:
                created_at = alt_created_at

            if created_at:
                alt_user_obj["created_at"] = created_at

            accessed_at = alt_user_obj.get("accessed_at")
            oi_accessed_at = oi_user_obj.get("accessed_at")
            if (oi_accessed_at and accessed_at) and oi_accessed_at > accessed_at:
                accessed_at = oi_accessed_at

            if accessed_at:
                alt_user_obj["accessed_at"] = accessed_at

        if create_alt:
            LOG.info("create user: {}".format(alt_user_obj))
            operation = pymongo.InsertOne(alt_user_obj)
            user_requests.append(operation)
        else:
            LOG.info("update user: {}".format(alt_user_obj))
            alt_user_id = alt_user_obj.pop("_id")
            operation = pymongo.UpdateOne({"_id": alt_user_id}, {"$set": alt_user_obj})
            user_requests.append(operation)

        # finally, delete the oi user
        operation = pymongo.DeleteOne({"_id": ObjectId(str(oi_user_id))})
        user_requests.append(operation)

        ###
        ### events
        ###

        LOG.info("searching for events for user id {}".format(oi_user_id))
        oi_user_events = adapter.event_collection.find(
            {"user_id": ObjectId(str(oi_user_id))}
        )
        if oi_user_events.count() > 0:
            LOG.info("===EVENTS===")
        for event in oi_user_events:
            LOG.info("user event: {}".format(event))
            event_id = event.get("_id")
            operation = pymongo.UpdateOne(
                {"_id": event_id},
                {
                    "$set": {
                        "user_id": oi_user_email,
                        "user_name": alt_user_obj.get("name"),
                    }
                },
            )
            event_requests.append(operation)

        ###
        ### ACMG classifications
        ###
        LOG.info("searching for acmg for user id {}".format(oi_user_id))
        oi_user_acmg = adapter.acmg_collection.find(
            {"user_id": ObjectId(str(oi_user_id))}
        )
        if oi_user_acmg.count() > 0:
            LOG.info("===ACMG===")
            for acmg in oi_user_acmg:
                LOG.info("acmg: {}".format(acmg))
                operation = pymongo.UpdateOne(
                    {"_id": acmg.get("_id")},
                    {
                        "$set": {
                            "user_id": oi_user_email,
                            "user_name": alt_user_obj.get("name"),
                        }
                    },
                )
                acmg_requests.append(operation)

        # Clinvar
        LOG.info("searching for ClinVar for user id {}".format(oi_user_id))
        oi_user_clinvar = adapter.clinvar_collection.find(
            {"user": ObjectId(str(oi_user_id))}
        )
        if oi_user_clinvar.count() > 0:
            LOG.info("=== ClinVar ===")
            for clinvar in oi_user_clinvar:
                LOG.info("acmg: {}".format(clinvar))
                operation = pymongo.UpdateOne(
                    {"_id": clinvar.get("_id")}, {"$set": {"user": oi_user_email}}
                )
                clinvar_requests.append(operation)

        # clinvar_submission
        LOG.info("searching for clinvar submissions for user id {}".format(oi_user_id))
        oi_user_clinvars = adapter.clinvar_submission_collection.find(
            {"user_id": ObjectId(str(oi_user_id))}
        )
        if oi_user_clinvars.count() > 0:
            LOG.info("=== ClinVar submission ===")
            for clinvars in oi_user_clinvars:
                LOG.info("acmg: {}".format(clinvars))
                operation = pymongo.UpdateOne(
                    {"_id": clinvars.get("_id")}, {"$set": {"user_id": oi_user_email}}
                )
                clinvar_submission_requests.append(operation)

        ###
        ### cases
        ###
        LOG.info("searching for cases assigned to user id {}".format(oi_user_id))
        oi_user_cases = adapter.case_collection.find(
            {"assignees": ObjectId(str(oi_user_id))}
        )
        if oi_user_cases.count() > 0:
            LOG.info("=== Case assignees ===")
            for case in oi_user_cases:
                LOG.info("case {} assignees: {}".format(case["_id"], case["assignees"]))
                operation = pymongo.UpdateOne(
                    {"_id": case.get("_id"), "assignees": ObjectId(str(oi_user_id))},
                    {"$set": {"assignees.$": oi_user_email}},
                )
                case_requests.append(operation)

    if not user_requests:
        LOG.info("No ObjectId user IDs found - nothing more to do.")

    # if everything worked out ok with dryrun, and after getting this far on a live run,
    # bulk write all proposed changes.
    if event_requests:
        LOG.info("event requests to execute: {}".format(event_requests))
        if not dryrun:
            result = adapter.event_collection.bulk_write(event_requests, ordered=False)

    if acmg_requests:
        LOG.info("acmg requests to execute: {}".format(acmg_requests))
        if not dryrun:
            result = adapter.acmg_collection.bulk_write(acmg_requests, ordered=False)
            LOG.info("Modified {} ACMG.".format(result.modified_count))

    if clinvar_requests:
        LOG.info("clinvar requests to execute: {}".format(clinvar_requests))
        if not dryrun:
            result = adapter.clinvar_collection.bulk_write(
                clinvar_requests, ordered=False
            )
            LOG.info("Modified {} ClinVar.".format(result.modified_count))

    if clinvar_submission_requests:
        LOG.info(
            "clinvar sub requests to execute: {}".format(clinvar_submission_requests)
        )
        if not dryrun:
            result = adapter.clinvar_submission_collection.bulk_write(
                clinvar_submission_requests, ordered=False
            )
            LOG.info("Modified {} ClinVar submissions.".format(result.modified_count))

    if case_requests:
        LOG.info("case requests to execute: {}".format(case_requests))
        if not dryrun:
            result = adapter.case_collection.bulk_write(case_requests, ordered=False)
            LOG.info("Modified {} case submissions.".format(result.modified_count))

    # now delete oi user, and actually update/create alt user
    if user_requests:
        LOG.info("user requests to execute: {}".format(user_requests))
        if not dryrun:
            result = adapter.user_collection.bulk_write(user_requests, ordered=False)
            LOG.info(
                "Modified users with the following: {}".format(result.bulk_api_result)
            )