Example #1
def divide_researchers_into_2groups():
    col_author = connectTable("qiuzh", "mag_researchers0810")
    col1 = connectTable("qiuzh", "researchers0810_trainingset")
    col2 = connectTable('qiuzh', "researchers0810_testset")
    opt1 = []
    opt2 = []
    count = 0
    cursor = col_author.find(no_cursor_timeout=True)
    for researcher in cursor:
        count += 1
        if researcher["first_year"] <= 1996:
            opt1.append(pymongo.InsertOne(researcher))
        else:
            opt2.append(pymongo.InsertOne(researcher))

        if count % 10000 == 0:
            print("Processed (x10000):", count / 10000, flush=True)
            if opt1:
                col1.bulk_write(opt1, ordered=False)
                print("Written:", len(opt1), flush=True)
            if opt2:
                col2.bulk_write(opt2, ordered=False)
                print("Written:", len(opt2), flush=True)
            opt1 = []
            opt2 = []
    if opt1:
        col1.bulk_write(opt1, ordered=False)
        print("Wrote remaining:", len(opt1), flush=True)
    if opt2:
        col2.bulk_write(opt2, ordered=False)
        print("Wrote remaining:", len(opt2), flush=True)
    cursor.close()
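Several of these snippets (Examples 1, 23, 24, 26, 27 and 28) rely on a `connectTable(db, collection)` helper that is not shown. A minimal sketch, assuming a MongoDB server on the default localhost port and no authentication, might look like this:

import pymongo

def connectTable(db_name, collection_name, uri="mongodb://localhost:27017"):
    # Hypothetical stand-in for the connectTable helper used in these examples:
    # open a client and return the requested collection. The real helper
    # presumably reuses a shared client and reads credentials from configuration.
    client = pymongo.MongoClient(uri)
    return client[db_name][collection_name]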
Example #2
def replace_with_correct_contig(mongo_source, assembly_accession, study_accession, incorrect_contig, correct_contig,
                                num_variants_to_replace):
    sve_collection = mongo_source.mongo_handle[mongo_source.db_name]["submittedVariantEntity"]
    filter_criteria = {'seq': assembly_accession, 'study': study_accession, 'contig': incorrect_contig}
    cursor = sve_collection.with_options(read_concern=ReadConcern("majority")) \
        .find(filter_criteria, no_cursor_timeout=True).limit(num_variants_to_replace)
    insert_statements = []
    drop_statements = []
    total_inserted, total_dropped = 0, 0
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, "Original id is different from the one calculated %s != %s" % (
                variant['_id'], original_id)
            variant['contig'] = correct_contig
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
        result_insert = sve_collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
            .bulk_write(requests=insert_statements, ordered=False)
        total_inserted += result_insert.inserted_count
        result_drop = sve_collection.with_options(write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
            .bulk_write(requests=drop_statements, ordered=False)
        total_dropped += result_drop.deleted_count
        logger.info('%s / %s new documents inserted' % (total_inserted, num_variants_to_replace))
        logger.info('%s / %s old documents dropped' % (total_dropped, num_variants_to_replace))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()
    return total_inserted
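Examples 2, 6, 8 and 16 recompute `get_SHA1` over the variant document to verify and regenerate `_id`. The real implementation belongs to the EVA tooling; the sketch below only illustrates the general idea and assumes the hash covers the seq, study, contig, start, ref and alt fields:

import hashlib

def get_SHA1(variant_rec):
    # Sketch only: hash a deterministic summary of the fields that identify a
    # submitted variant. The exact field list and separator are assumptions;
    # the real function must match how the accessioning service builds _id.
    keys = ['seq', 'study', 'contig', 'start', 'ref', 'alt']
    text = '_'.join(str(variant_rec[key]) for key in keys)
    return hashlib.sha1(text.encode()).hexdigest().upper()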
Example #3
def deprecate(settings_xml_file, study, assembly_accession, contigs=None):
    """
    Connect to mongodb and retrieve all variants that need to be deprecated.
    Copy the variants into the operation collection and delete them from the submitted variant collection.
    """
    with pymongo.MongoClient(get_mongo_uri_for_eva_profile('production', settings_xml_file)) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantEntity"]
        deprecated_sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantOperationEntity"]
        cursor = sve_collection.find({'seq': assembly_accession, 'study': study, 'contig': {'$in': contigs}})
        insert_statements = []
        drop_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(inactive_object(variant)))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        # There should only be 458 variants to deprecate
        assert len(insert_statements) == 458
        assert len(drop_statements) == 458

        logger.info('Found %s variants to deprecate', len(insert_statements))

        result_insert = deprecated_sve_collection.bulk_write(requests=insert_statements, ordered=False)
        result_drop = sve_collection.bulk_write(requests=drop_statements, ordered=False)
        logger.info('There were %s new documents inserted in inactive entities' % result_insert.inserted_count)
        logger.info('There were %s old documents dropped from the submitted variant collection' % result_drop.deleted_count)
Example #4
def do_create(lsst, nobjects=1000):
    try:
        lsst.drop_collection('y')
        print('y dropped')
    except pymongo.errors.PyMongoError:
        pass

    ra = 0
    decl = 0
    window = 180.

    stepper = st.Stepper()
    requests = []
    for i in range(nobjects):
        obj = {
            'loc': [(random.random() * 2 * window - window),
                    (random.random() * 2 * window - window)]
        }
        # lsst.y.insert( obj )
        requests.append(pymongo.InsertOne(obj))

    try:
        lsst.y.bulk_write(requests)
    except BulkWriteError as bwe:
        print('error in bulk write', bwe.details)
        exit()

    stepper.show_step('y created')

    # lsst.Object.aggregate( [ {'$match' : {'chunkId': 516} }, { '$project': { 'loc': [ '$ra', '$decl' ] } }, {'$limit': 1000}, {'$out': 'y'} ] )
    print(lsst.y.count_documents({}))

    result = lsst.y.find()
    for i, o in enumerate(result):
        print(o)
        if i > 10:
            break

    stepper = st.Stepper()
    try:
        lsst.y.create_index([('loc.0', pymongo.ASCENDING)])
    except pymongo.errors.PyMongoError as e:
        print('error create index on ra', e)
    stepper.show_step('index loc.0 creation')

    stepper = st.Stepper()
    try:
        lsst.y.create_index([('loc.1', pymongo.ASCENDING)])
    except pymongo.errors.PyMongoError as e:
        print('error create index on decl', e)
    stepper.show_step('index loc.1 creation')

    stepper = st.Stepper()
    try:
        lsst.y.create_index([('loc', pymongo.GEO2D)])
    except pymongo.errors.PyMongoError as e:
        print('error create_geo_index', e)
    stepper.show_step('index loc creation')

    test9(lsst.y)
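`st.Stepper` and `test9` come from the surrounding benchmark module and are not shown. A plausible sketch of the timing helper, assuming it simply measures elapsed time between steps:

import time

class Stepper:
    # Hypothetical stand-in for st.Stepper: remember a start time and print
    # the elapsed time each time show_step is called.
    def __init__(self):
        self.start = time.time()

    def show_step(self, label):
        elapsed = time.time() - self.start
        print(f"{label}: {elapsed:.3f}s")
        self.start = time.time()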
Example #5
    def test_successful_multiple_queries(self):
        with tracer.start_active_span("test"):
            self.conn.test.records.bulk_write([pymongo.InsertOne({"type": "string"}),
                                               pymongo.UpdateOne({"type": "string"}, {"$set": {"type": "int"}}),
                                               pymongo.DeleteOne({"type": "string"})])

        assert_is_none(tracer.active_span)

        spans = self.recorder.queued_spans()
        self.assertEqual(len(spans), 4)

        test_span = spans.pop()

        seen_span_ids = set()
        commands = []
        for span in spans:
            self.assertEqual(test_span.t, span.t)
            self.assertEqual(span.p, test_span.s)

            # check if all spans got a unique id
            assert_false(span.s in seen_span_ids)

            seen_span_ids.add(span.s)
            commands.append(span.data["mongo"]["command"])

        # ensure spans are ordered the same way as commands
        assert_list_equal(commands, ["insert", "update", "delete"])
Example #6
def get_insert_statements(sve_collection, contig_equivalents):
    wrong_contigs = list(contig_equivalents.keys())
    filter_criteria = {
        'seq': 'GCA_000001895.4',
        'study': 'PRJEB42012',
        'contig': {
            '$in': wrong_contigs
        }
    }
    cursor = sve_collection.with_options(
        read_concern=ReadConcern("majority")).find(filter_criteria,
                                                   no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    try:
        for variant in cursor:
            original_id = get_SHA1(variant)
            assert variant['_id'] == original_id, f"Original id is different from the one calculated " \
                                                  f"{variant['_id']} != {original_id}"
            variant['contig'] = contig_equivalents[variant['contig']]
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()

    return insert_statements, drop_statements
Example #7
    def _bulk_insert_events(self, event_col, descriptor, events, validate, ts):

        descriptor_uid = doc_or_uid_to_uid(descriptor)

        to_write = []
        for ev in events:
            data = dict(ev['data'])

            # Replace any filled data with the datum_id stashed in 'filled'.
            for k, v in six.iteritems(ev.get('filled', {})):
                if v:
                    data[k] = v
            # Convert any numpy types to native Python types.
            apply_to_dict_recursively(data, sanitize_np)
            timestamps = dict(ev['timestamps'])
            apply_to_dict_recursively(timestamps, sanitize_np)

            # check keys, this could be expensive
            if validate:
                if data.keys() != timestamps.keys():
                    raise ValueError(
                        BAD_KEYS_FMT.format(data.keys(),
                                            timestamps.keys()))
            ev_uid = ts + '-' + ev['uid']

            ev_out = dict(descriptor=descriptor_uid, uid=ev_uid,
                          data=data, timestamps=timestamps,
                          time=ev['time'],
                          seq_num=ev['seq_num'])

            to_write.append(pymongo.InsertOne(ev_out))

        event_col.bulk_write(to_write, ordered=True)
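Examples 7 and 25 normalise the descriptor argument with `doc_or_uid_to_uid`. A minimal sketch, assuming the descriptor is either a dict carrying a 'uid' key or the uid string itself:

def doc_or_uid_to_uid(doc_or_uid):
    # Accept either a document with a 'uid' field or a bare uid string and
    # return the uid string. Sketch of the helper assumed by these examples.
    if isinstance(doc_or_uid, dict):
        return doc_or_uid['uid']
    return doc_or_uid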
Example #8
def correct(mongo_user, mongo_password, mongo_host, study, reference_source,
            reference_dest):
    """
    Connect to mongodb and retrieve all variants that should be updated, check their keys and update them in bulk.
    """
    with get_mongo_connection_handle(
            username=mongo_user, password=mongo_password,
            host=mongo_host) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle["eva_accession_sharded"][
            "submittedVariantEntity"]
        cursor = sve_collection.find({'study': study, 'seq': reference_source})
        insert_statements = []
        drop_statements = []
        record_checked = 0
        for variant in cursor:
            # Ensure that the variant we are changing has the expected SHA1
            original_id = get_SHA1(variant)
            assert variant[
                '_id'] == original_id, "Original id is different from the one calculated %s != %s" % (
                    variant['_id'], original_id)
            variant['seq'] = reference_dest
            variant['_id'] = get_SHA1(variant)
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            record_checked += 1

        print('Retrieved %s documents and checked matching SHA1 hash' %
              record_checked)
        result_insert = sve_collection.bulk_write(requests=insert_statements,
                                                  ordered=False)
        print('There were %s new documents inserted' %
              result_insert.inserted_count)
        result_drop = sve_collection.bulk_write(requests=drop_statements,
                                                ordered=False)
        print('There were %s old documents dropped' % result_drop.deleted_count)
Example #9
def bulkwrite(coll,
              objs,
              append=False):  # append is used during bson/json export
    if gen.dumpDir is not None:
        if not os.path.isdir(gen.dumpDir):
            os.makedirs(gen.dumpDir, exist_ok=True)
        if not os.path.isdir(gen.dumpDir):
            raise FileNotFoundError(gen.dumpDir)
        ext: str = "bson" if gen.bsonMode else "json"
        outpath: str = os.path.join(gen.dumpDir, f"{coll.name}.{ext}")
        openmode = ("a" if append else "w") + ("b" if gen.bsonMode else "")
        with open(outpath, openmode) as f:
            for o in objs:
                if gen.bsonMode:
                    f.write(bson.encode(o.__dict__))
                else:
                    f.write(json.dumps(o.__dict__, default=str, indent=4))
        print(f"Colleciton {coll.name}: dumped to {outpath}")
    else:
        ledger = []
        for x in objs:
            ledger.append(pymongo.DeleteOne({"_id": x._id}))
            ledger.append(pymongo.InsertOne(x.__dict__))
        res = coll.bulk_write(ledger)
        print(f"Collection {coll.name}: {res.bulk_api_result}")
Example #10
    def runTestTrialThread(self, testIdx):
        # Perform inserts
        batch = []
        errors = []
        runTime = 0
        client = pymongo.MongoClient(self.connString)
        mongoColl = client[self.dbName][self.collName].with_options(
            write_concern=pymongo.write_concern.WriteConcern(w=1)
        )
        for i in range(0, self.numDocsToInsert):
            batch.append(pymongo.InsertOne(self.documentProvider.createDocument(testIdx, i)))

            if i % self.insertBatchSize == 0:
                startTime = time.time()
                try:
                    mongoColl.bulk_write(batch, ordered=False)
                except pymongo.errors.BulkWriteError as e:
                    for x in e.details[u'writeErrors']:
                        error_id = x[u'op']['_id']
                        errors.append(get_document(batch, error_id))
                runTime += (time.time() - startTime)
                batch = []
        if batch:
            startTime = time.time()
            try:
                mongoColl.bulk_write(batch, ordered=False)
            except pymongo.errors.BulkWriteError as e:
                for x in e.details[u'writeErrors']:
                    error_id = x[u'op']['_id']
                    errors.append(get_document(batch, error_id))
            runTime += (time.time() - startTime)
        batch = []
        return runTime
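The error-handling branches call a `get_document(batch, error_id)` helper that is not shown. A possible sketch, which relies on the private `_doc` attribute that `pymongo.InsertOne` uses to store its document (a production version would more likely keep the raw documents in a parallel list):

def get_document(batch, error_id):
    # Sketch: recover the document whose insert failed by matching its _id
    # against the documents wrapped in the batched InsertOne requests.
    for request in batch:
        doc = getattr(request, '_doc', None)
        if doc is not None and doc.get('_id') == error_id:
            return doc
    return None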
Example #11
def store_urls_mongo(url_docs):
    '''store url documents in mongodb.
    On error, this function mimics the behavior of the invoke function where it
    returns a dict with a key "http_status_code" equal to 500.
    '''
    connect_str = os.getenv('MONGO_URI')
    try:
        client = pymongo.mongo_client.MongoClient(connect_str)
    except:
        return {
            "http_status_code": 500,
            "error": "MongoDB failed to connect to " + connect_str
        }

    db = client.url_shorten
    collection = db.url_shorten

    requests = [pymongo.InsertOne(doc) for doc in url_docs]

    ret = collection.bulk_write(requests)

    if ret.inserted_count != len(url_docs):
        return {
            "http_status_code":
            500,
            "error":
            "Some inserts failed: " + str(len(url_docs)) + " attempts, " +
            str(ret.inserted_count) + " successes"
        }

    return {"http_status_code": 200, "status": "success"}
Example #12
def write_reactions_to_mine(reactions: List[dict],
                            db: MINE,
                            chunk_size: int = 10000) -> None:
    """Write reactions to reaction collection of MINE.

    Parameters
    ----------
    reactions : List[dict]
        List of reaction dictionaries to write.
    db : MINE
        MINE object to write reactions with.
    chunk_size : int, optional
        Size of chunks to break reactions into when writing, by default 10000.
    """
    n_rxns = len(reactions)
    for i, rxn_chunk in enumerate(utils.Chunks(reactions, chunk_size)):
        if i % 20 == 0:
            print(
                f"Writing Reactions: Chunk {i} of {int(n_rxns/chunk_size) + 1}"
            )
        rxn_requests = [
            pymongo.InsertOne(utils.convert_sets_to_lists(rxn_dict))
            for rxn_dict in rxn_chunk
        ]

        db.reactions.bulk_write(rxn_requests, ordered=False)
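`utils.Chunks` is part of the MINE utilities and is not shown here. Functionally it just has to yield successive slices of `chunk_size` reactions; a simple sketch of such a generator (the real class may behave differently):

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of a list. Illustrative
    # replacement for utils.Chunks, whose exact behaviour is assumed.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]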
Example #13
    def _push_similarity(self, client):
        similarity_collection = client[DB_NAME][SIMILARITY_COLLECTION_NAME]
        similarity_collection.drop()

        bulk = []
        for game_id, similarities in self.game_similar.items():
            top_similar = similarities.most_common(SIMILARITY_CUTOFF)
            similar = []
            for item_id, score in top_similar:
                if item_id in self.game_ratings:
                    game_rating = self.game_ratings[item_id]
                    score = score * TEXT_SIMILARITY_FACTOR + (
                        game_rating / MAX_GAME_RATING) * RATING_FACTOR
                similar.append({'itemId': item_id, 'score': float(score)})

            bulk.append(pymongo.InsertOne({
                'itemId': game_id,
                'similar': sorted(similar, key=lambda x: x['score'], reverse=True)
            }))

        similarity_collection.bulk_write(bulk)
        similarity_collection.create_index([('itemId', pymongo.ASCENDING)],
                                           unique=True)
Example #14
def deprecate(settings_xml_file, database_name, contigs=None):
    """
    Connect to mongodb and retrieve all variants that need to be deprecated.
    Copy the variants into the deletion collection and delete them from the variant collection.
    """
    with pymongo.MongoClient(
            get_mongo_uri_for_eva_profile('production',
                                          settings_xml_file)) as mongo_handle:
        variant_collection = mongo_handle[database_name]['variants_2_0']
        deleted_variant_collection = mongo_handle[database_name][
            'to_delete_variants_2_0']

        cursor = variant_collection.find({'chr': {'$in': contigs}})
        drop_statements = []
        insert_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        logger.info('Found %s variants to remove', len(drop_statements))
        result_insert = deleted_variant_collection.bulk_write(
            requests=insert_statements, ordered=False)
        result_drop = variant_collection.bulk_write(requests=drop_statements,
                                                    ordered=False)
        logger.info('There were %s new documents inserted in to_delete collection' %
                    result_insert.inserted_count)
        logger.info('There were %s documents dropped from the variants collection' %
                    result_drop.deleted_count)
Example #15
def build_request(words, fields, schema):
    obj = dict()
    for item, word in enumerate(words):
        field = fields[item]

        if field not in schema.fields:
            print('error')
            return None

        ftype = schema.fields[field]

        if word == 'NULL':
            # NULL values are skipped entirely
            continue

        value = word  # default: keep the value as a string
        try:
            if ftype in ('bit(1)', 'int(11)', 'bigint(20)'):
                value = int(word)
            elif ftype in ('double', 'float'):
                value = float(word)
        except ValueError:
            # conversion failed: keep value as a string
            print('field:', field, 'value:', value)

        # print(field, '=', value)
        obj[field] = value

    return pymongo.InsertOne(obj)
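A hypothetical way to drive `build_request`: the `Schema` stub, field names and input line below are illustrative only, not part of the original code.

class Schema:
    # Minimal stand-in for the schema object expected by build_request.
    def __init__(self, fields):
        self.fields = fields

schema = Schema({'id': 'bigint(20)', 'ra': 'double', 'decl': 'double'})
fields = ['id', 'ra', 'decl']
line = "42\t10.5\t-3.25"

request = build_request(line.split('\t'), fields, schema)
# Requests built this way can then be passed to collection.bulk_write([...]).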
Example #16
def do_updates(cve_collection, synonym_dictionaries, assembly_accession,
               chunk_size, number_of_variants_to_replace):
    cursor = cve_collection.find({'asm': assembly_accession},
                                 no_cursor_timeout=True)
    insert_statements = []
    drop_statements = []
    record_checked = 0
    already_genbanks = 0
    total_inserted = 0
    total_dropped = 0
    logging.info("Performing updates...")
    try:
        for variant in cursor:
            # Ensure that the variant we are changing has the expected SHA1
            original_id = get_SHA1(variant)
            assert variant[
                '_id'] == original_id, "Original id is different from the one calculated %s != %s" % (
                    variant['_id'], original_id)
            genbank, was_already_genbank = get_genbank(synonym_dictionaries,
                                                       variant['contig'])
            if was_already_genbank:
                already_genbanks += 1
            else:
                variant['contig'] = genbank
                variant['_id'] = get_SHA1(variant)
                insert_statements.append(pymongo.InsertOne(variant))
                drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            record_checked += 1
            if len(insert_statements) >= chunk_size:
                total_inserted, total_dropped = execute_bulk(
                    drop_statements, insert_statements, cve_collection,
                    total_dropped, total_inserted)
                logging.info('%s / %s new documents inserted' %
                             (total_inserted, number_of_variants_to_replace))
                logging.info('%s / %s old documents dropped' %
                             (total_dropped, number_of_variants_to_replace))
    except Exception as e:
        print(traceback.format_exc())
        raise e
    finally:
        cursor.close()

    if len(insert_statements) > 0:
        total_inserted, total_dropped = execute_bulk(drop_statements,
                                                     insert_statements,
                                                     cve_collection,
                                                     total_dropped,
                                                     total_inserted)
    logging.info('Retrieved %s documents and checked matching SHA1 hash' %
                 record_checked)
    logging.info(
        '{} of those documents already had a GenBank contig. If the projects were all affected, '
        'that number should be 0, but even if it is not, there is nothing else to fix'
        .format(already_genbanks))
    logging.info('There were %s new documents inserted' % total_inserted)
    logging.info('There were %s old documents dropped' % total_dropped)
    return total_inserted
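`execute_bulk` is defined elsewhere. Judging from the call sites, it has to run both bulk writes, update the running totals and empty the statement lists in place so the caller can keep accumulating into them; a sketch along those lines (the exact behaviour is an assumption):

def execute_bulk(drop_statements, insert_statements, collection,
                 total_dropped, total_inserted):
    # Insert the corrected documents, then drop the originals, and clear both
    # request lists in place so the caller's lists start empty again.
    result_insert = collection.bulk_write(requests=insert_statements, ordered=False)
    total_inserted += result_insert.inserted_count
    result_drop = collection.bulk_write(requests=drop_statements, ordered=False)
    total_dropped += result_drop.deleted_count
    insert_statements.clear()
    drop_statements.clear()
    return total_inserted, total_dropped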
Example #17
def main():
    with open('../warehouses_de.geojson') as json_file:
        data = json.load(json_file)
    batch = []
    for w in data['features']:
        batch.append(pymongo.InsertOne(w))
        write_batch(batch=batch, collection=warehouse_coll, full_batch_required=True)

    write_batch(batch=batch, collection=warehouse_coll, full_batch_required=False)
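`write_batch` and `warehouse_coll` come from the surrounding module. Consistent with how it is called above, here is a sketch in which the batch is only flushed once it reaches an assumed threshold, unless `full_batch_required` is False (the threshold value is an assumption):

BATCH_SIZE = 1000  # assumed flush threshold

def write_batch(batch, collection, full_batch_required):
    # Flush the accumulated InsertOne requests either when the batch is full
    # or, when full_batch_required is False, whatever its current size.
    if full_batch_required and len(batch) < BATCH_SIZE:
        return
    if batch:
        collection.bulk_write(batch, ordered=False)
        batch.clear()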
Example #18
    def insert_data(
        self,
        data,
        tags=None,
    ):
        if tags is not None:
            for tag, tag_val in tags.items():
                data[tag] = tag_val

        self.enqueue(self.collection, pymongo.InsertOne(data))
Example #19
def _get_cpd_insert(cpd_dict: dict):
    output_keys = [
        "_id",
        "ID",
        "SMILES",
        "InChI_key",
        "Type",
        "Generation",
        "Expand",
        "Reactant_in",
        "Product_of",
        "Matched_Peak_IDs",
        "Matched_Adducts",
        "Predicted_RT",
    ]

    # create Reactant_in
    reactant_in_requests = []
    product_of_requests = []
    insert_dict = {
        key: cpd_dict.get(key)
        for key in output_keys if cpd_dict.get(key) is not None
    }
    if "Reactant_in" in insert_dict:
        chunked_reactant_in = _get_reactant_in_insert(cpd_dict)
        insert_dict["Reactant_in"] = []
        for r_in_dict in chunked_reactant_in:
            reactant_in_requests.append(pymongo.InsertOne(r_in_dict))
            insert_dict["Reactant_in"].append(r_in_dict["_id"])

    # create Product_of
    if "Product_of" in insert_dict:
        chunked_product_of = _get_product_of_insert(cpd_dict)
        insert_dict["Product_of"] = []
        for p_of_dict in chunked_product_of:
            product_of_requests.append(pymongo.InsertOne(p_of_dict))
            insert_dict["Product_of"].append(p_of_dict["_id"])

    cpd_request = pymongo.InsertOne(insert_dict)
    return cpd_request, reactant_in_requests, product_of_requests
Example #20
    def insert_data(
        self,
        data,
        collection='generic',
        tags=None,
    ):
        if collection is None:
            raise errors.Invalid("cannot insert data: no collection given")

        if tags is not None:
            for tag, tag_val in tags.items():
                data[tag] = tag_val

        self.enqueue(collection, pymongo.InsertOne(data))
Example #21
def test_collection_bulk_write(elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction('transaction.test')
    requests = [pymongo.InsertOne({'x': 1}),
                pymongo.DeleteOne({'x': 1}),
                pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction('transaction.test')
    transactions = elasticapm_client.instrumentation_store.get_all()
    span = _get_pymongo_trace(transactions[0]['spans'])
    assert span['type'] == 'db.mongodb.query'
    assert span['name'] == 'elasticapm_test.blogposts.bulk_write'
Example #22
def insert_reaction(reaction_dict):
    """Inserts a reaction into the MINE database and returns _id of the
        reaction in the mine database. 

    :param reaction_dict: A dictionary containing 'Reactants' and
        'Products' lists of StoichTuples
    :type reaction_dict: dict    
    :return: Request for bulk insert
    :rtype: pymongo.InsertOne
    """

    reaction_dict = utils.convert_sets_to_lists(reaction_dict)

    return pymongo.InsertOne(reaction_dict)
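A hypothetical usage of `insert_reaction`, collecting the requests and writing them in one bulk operation (the `reaction_dicts` list and `db.reactions` handle are illustrative, not part of the original code):

# Build one InsertOne request per reaction, then write them all at once.
requests = [insert_reaction(rxn) for rxn in reaction_dicts]
if requests:
    db.reactions.bulk_write(requests, ordered=False)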
Example #23
def researchers_collaboration_network():
    '''
    There are some problems with the researchers_con_innewcollection network, so we use another method instead,
    i.e. we build the collaboration network first.
    :return:
    '''

    start_time = time()
    print(start_time, flush=True)
    col1 = connectTable("qiuzh", "mag_papers0510")
    col2 = connectTable("qiuzh", "mag_researchers0707")
    col3 = connectTable("qiuzh", "coauthor_network0722")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)
    count = 0
    for i in cursor:
        count += 1
        author_id = i["_id"]
        # coauthor_times = 0
        # coauthor_list = []
        papers = i["new_pubs"]
        for paper in papers:
            paper_details = col1.find_one({"_id": paper},
                                          no_cursor_timeout=True)
            for author in paper_details["authors"]:
                if author["id"] != author_id and col2.find_one(
                    {"_id": author["id"]}, no_cursor_timeout=True):
                    # coauthor_list.append({"coauthor_id": author["id"], "coauthor_time": paper_details["year"]})
                    operation.append(pymongo.InsertOne({
                        "author_id": author_id,
                        "coauthor_id": author["id"],
                        "coauthor_time": paper_details["year"],
                    }))
        if count % 10000 == 0:
            print("Processed (x10000):", count / 10000, flush=True)
            if operation:
                col3.bulk_write(operation, ordered=False)
                print("Written up to (x10000):", count / 10000, flush=True)
                operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
    print("Finished, final batch size:", len(operation), flush=True)
    print(time(), (time() - start_time), flush=True)
Example #24
def add_coauthor_relation2newcollection():
    '''
    Coauthor counts and coauthor relationships.
    :return:
    mag_authors0411:
    {coauthor_counts: n}
    {coauthor_list: [{year: 1999, id: 1000000}, {year: 1998, id: 1000001}]}
    Because some authors in the dataset have too many collaborations and would exceed the maximum size of a document,
    we store the relations in a new collection with the fields:
    _id:
    "author_id":
    "coauthor_id":
    "coauthor_time":
    '''

    start_time = time()
    print(start_time,flush=True)
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_authors0411")
    col3 = connectTable("qiuzh", "coauthor_network0420")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)[3790001:]
    count = 0
    for i in cursor:
        count += 1
        author_id = i["_id"]
        # coauthor_times = 0
        # coauthor_list = []
        papers = i["new_pubs"]
        for paper in papers:
            paper_details = col1.find_one({"_id": paper})
            # if paper_details:
            # coauthor_times += (len(paper_details["authors"]) - 1)
            for author in paper_details["authors"]:
                if author["id"] != author_id:
                    # coauthor_list.append({"coauthor_id": author["id"], "coauthor_time": paper_details["year"]})
                    operation.append(pymongo.InsertOne(
                        {"author_id": author_id, "coauthor_id": author["id"], "coauthor_time": paper_details["year"],
                         }))
        if count % 10000 == 0:
            print("Processed (x10000):", count / 10000, flush=True)
            if operation:
                col3.bulk_write(operation, ordered=False)
                print("Written up to (x10000):", count / 10000, flush=True)
                operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
    print("Finished, final batch size:", len(operation), flush=True)
    print(time(), (time() - start_time), flush=True)
Example #25
def bulk_insert_events(event_col, descriptor, events, validate):
    """Bulk insert many events

    Parameters
    ----------
    descriptor : dict or str
        The Descriptor to insert event for.  Can be either
        a dict with a 'uid' key or a uid string
    events : iterable
       iterable of dicts matching the bs.Event schema
    validate : bool
       If it should be checked that each pair of data/timestamps
       dicts has identical keys

    Returns
    -------
    ret : pymongo.results.BulkWriteResult
        result of the bulk write, with details about the insertion
    """
    descriptor_uid = doc_or_uid_to_uid(descriptor)

    def event_factory():
        for ev in events:
            data = dict(ev['data'])
            # Replace any filled data with the datum_id stashed in 'filled'.
            for k, v in six.iteritems(ev.get('filled', {})):
                if v:
                    data[k] = v
            # Convert any numpy types to native Python types.
            apply_to_dict_recursively(data, sanitize_np)
            timestamps = dict(ev['timestamps'])
            apply_to_dict_recursively(timestamps, sanitize_np)
            # check keys, this could be expensive
            if validate:
                if data.keys() != timestamps.keys():
                    raise ValueError(
                        BAD_KEYS_FMT.format(data.keys(), timestamps.keys()))

            ev_out = dict(descriptor=descriptor_uid,
                          uid=ev['uid'],
                          data=data,
                          timestamps=timestamps,
                          time=ev['time'],
                          seq_num=ev['seq_num'])
            yield ev_out

    bulk = [pymongo.InsertOne(ev) for ev in event_factory()]
    return event_col.bulk_write(bulk, ordered=True)
Example #26
def filter_researchers_paper_by_authors():
    '''
    From mag_researchers0707 (pubs >= 10, academic career length >= 10) to mag_researchers0810: only papers with
    at most 10 authors are kept in the dataset.
    :return:
    This function was created on 2021-08-10.
    '''
    # col2 = connectTable('qiuzh', "mag_researchers0707")
    # col2.drop()
    col1 = connectTable('qiuzh', "mag_researchers0707")
    col2 = connectTable('qiuzh', "mag_researchers0810")
    col_paper = connectTable("qiuzh", "mag_papers0510")
    cursor = col1.find(no_cursor_timeout=True)
    opt = []
    count = 0
    print(col1.count_documents({}))
    for i in cursor:
        count += 1
        pubs = i["new_pubs"]
        new_pubs = []
        for pub in pubs:
            paper = col_paper.find_one({"_id": pub["pid"]})
            if len(paper["authors"]) <= 10:
                new_pubs.append(pub)
        opt.append(
            pymongo.InsertOne({
                "_id": i["_id"],
                "new_pubs": new_pubs,
                "pub_count": i["pub_count"],
                "first_year": i["first_year"],
                "last_year": i["last_year"],
                "cn": i["cn"]
            }))
        if count % 10000 == 0:
            print(len(opt))
            print(count)
            print("Processed (x10000):", count / 10000, flush=True)
            col2.bulk_write(opt, ordered=False)
            print("Written up to (x10000):", count / 10000, flush=True)
            opt = []
    if opt:
        col2.bulk_write(opt, ordered=False)
        print("Wrote final batch of", len(opt))
    print(count)
    cursor.close()
Example #27
def filter_author_by_careerlife(begin, end, msg):
    '''
    :param msg:
    :param begin:
    :param end:
    :return: authors with an academic career spanning at least 20 years
    '''
    col1 = connectTable('qiuzh', "mag_authors0421")
    col2 = connectTable('qiuzh', "mag_authors0411")
    cursor = col2.find(no_cursor_timeout=True)[begin:end]
    opt = []
    for i in cursor:
        if i["last_year"] - i["first_year"] >= 20:
            opt.append(pymongo.InsertOne({"_id": i["_id"], "new_pubs": i["new_pubs"], "pub_count": i["pub_count"],
                                          "first_year": i["first_year"], "last_year": i["last_year"]}))
    if opt:
        col1.bulk_write(opt, ordered=False)
    cursor.close()
Example #28
def filter_author_by_citation(begin, end, msg):
    '''
    :param msg: multi-process information
    :param begin: i-th
    :param end: i+1-th
    :return: authors with at least 5 publications
    '''
    col1 = connectTable("academic", "mag_authors")
    col2 = connectTable('qiuzh', "MAG_authors")
    opt = []
    # count = 0
    for i in col1.find({"n_pubs": {"$gte": 5}})[begin:end]:
        opt.append(pymongo.InsertOne(i))
    if opt:
        col2.bulk_write(opt, ordered=False)
    print("Thread: %s, processed %s documents" % (msg, len(opt)))
Example #29
    def test_collection_bulk_write(self):
        self.client.begin_transaction('transaction.test')
        requests = [
            pymongo.InsertOne({'x': 1}),
            pymongo.DeleteOne({'x': 1}),
            pymongo.ReplaceOne({'w': 1}, {'z': 1}, upsert=True)
        ]
        result = self.db.blogposts.bulk_write(requests)
        self.assertEqual(result.inserted_count, 1)
        self.assertEqual(result.deleted_count, 1)
        self.assertEqual(result.upserted_count, 1)
        self.client.end_transaction('transaction.test')
        transactions = self.client.instrumentation_store.get_all()
        trace = _get_pymongo_trace(transactions[0]['traces'])
        self.assertEqual(trace['type'], 'db.mongodb.query')
        self.assertEqual(trace['name'], 'elasticapm_test.blogposts.bulk_write')
Example #30
def test_collection_bulk_write(instrument, elasticapm_client, mongo_database):
    elasticapm_client.begin_transaction("transaction.test")
    requests = [
        pymongo.InsertOne({"x": 1}),
        pymongo.DeleteOne({"x": 1}),
        pymongo.ReplaceOne({"w": 1}, {"z": 1}, upsert=True),
    ]
    result = mongo_database.blogposts.bulk_write(requests)
    assert result.inserted_count == 1
    assert result.deleted_count == 1
    assert result.upserted_count == 1
    elasticapm_client.end_transaction("transaction.test")
    transactions = elasticapm_client.transaction_store.get_all()
    span = _get_pymongo_trace(transactions[0]["spans"])
    assert span["type"] == "db.mongodb.query"
    assert span["name"] == "elasticapm_test.blogposts.bulk_write"